#!/bin/sh
#******************************* License GPLv3 ********************************
# Factory_paquets_de_CRIs_gen : Factory to build one paquet-de-CRIs from the website www.les-cris.com, with a timestamp and a SHA256 signature of the package.
# Copyright (C) 2022  Antoine Herzog <info -!at-arobase=! les-cris -!dot-point=! com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
#******************************************************************************

# Launch the HTTrack retrieval of this web site.

# Record the absolute directory of this script now, because the later processing
# needs to cd to other folders and the launch script is looked up in this one.
# "${BASH_SOURCE[0]}" is an array expansion, which is not part of POSIX sh although
# the shebang is /bin/sh. "${BASH_SOURCE:-$0}" gives the same path under bash
# (an array referenced without a subscript is its element 0) and falls back to
# $0 in shells that do not define BASH_SOURCE.
SCRIPT_WEBSITE_COPIER_HOME_DIR="$( cd "$( dirname "${BASH_SOURCE:-$0}" )" && pwd )"

# Debug mode switch: the value 'true' turns on the configuration dump printed
# further down in this script; any other value leaves it off.
#   SCRIPTCTXT_DEBUG='true'
SCRIPTCTXT_DEBUG=''
# Testing mode switch: with 'true', only one page of the web site is retrieved,
# so that a test run is quick.
# Note : for HTTrack, the value is defined in the HTTrack launch script.
SCRIPTCTXT_TESTING_GEN='false'
#   SCRIPTCTXT_TESTING_GEN='true'

# Erasing the former result and overwriting the web site copy.
# Per the original note: shall be 'true' to overwrite; otherwise nothing is erased
# and the web site is copied inside a timestamped folder.
DEST_OUTPUT_OVERWRITE_FORMER_WEBSITE_COPY='false'
# Safety : per the original note, when overwriting is enabled the variable below is
# compared with DEST_OUTPUT_FOLDER_ROOT, to make sure not to erase another folder
# ("rm -rf ..." !). That comparison is not done in this section. Example value:
#DEST_OUTPUT_OVERWRITE_FORMER_WEBSITE_COPY_FOLDER_ROOT_FOR_CHECKING="/home/home_strghd/ant/Dev_Misc/SiteWeb/SiteWeb-Archives/SiteWeb-Archives-sites-aspires-220122-paquet-de-CRIs/smo-Essais-sites-internet/Smo_614_Gers-Numerique_com_TOTOTITITOTO"
DEST_OUTPUT_FOLDER_ADD_DATETIMESTAMP='true'
# The predefined date and timestamp to use.
# It must be provided by the calling script (the build of the Paquet-de-CRIs) in
# THIS_PAQUET_DE_CRIS_TIMESTAMP_USUALFORMAT. When that variable is empty or not
# set, no value is calculated here: the problem is reported and the script stops.
if [ -z "${THIS_PAQUET_DE_CRIS_TIMESTAMP_USUALFORMAT}" ]
then
	echo ""
	echo "#--------------------ERROR starting HTTrack process ----------------------"
	echo "ERROR : the variable THIS_PAQUET_DE_CRIS_TIMESTAMP_USUALFORMAT is not set. It should have been set by the calling script"
	echo ""
	echo "ERROR : the HTTrack process will end with exit code : -63."

	echo "#--------------------ERROR starting HTTrack process ----------------------"
	# Former fallback, kept for reference (datestamp e.g. 2002-09-21_19h26m):
	# DEST_OUTPUT_FOLDER_PREDEFINED_DATETIMESTAMP=$(date +%Y-%m-%d_%Hh%Mm)
	# POSIX only defines "exit" for an unsigned status (0 to 255), so "exit -63" is
	# not portable under /bin/sh. Bash reports -63 as 193 (-63 modulo 256); exit
	# with 193 explicitly so that callers see that same status in every shell.
	exit 193
else
	DEST_OUTPUT_FOLDER_PREDEFINED_DATETIMESTAMP="${THIS_PAQUET_DE_CRIS_TIMESTAMP_USUALFORMAT}"
	echo ""
	echo "DEST_OUTPUT_FOLDER_PREDEFINED_DATETIMESTAMP=${DEST_OUTPUT_FOLDER_PREDEFINED_DATETIMESTAMP}"
fi

# Location of the HTTrack program: directory and executable name are kept apart.
# Per the original note, HTTrack is installed under /usr by the package manager
# (DNF/Yum from the Fedora repository, or pkg_add from the OpenBSD repository).
APPLICATION_HTTRACK_HOME_DIR='/usr/bin'
APPLICATION_HTTRACK_EXEC_NAME='httrack'

WEB_SITE_NAME_USUAL="les-CRIs_com"
# Usual name of the web site copy made with HTTrack by these tool scripts.
# This default is always replaced below by the value given by the calling script.
WEB_SITE_NAME_COPIED_USUAL="${WEB_SITE_NAME_USUAL}-website-copied"
# For the Paquet-de-CRIs, this is the name. The timestamp is added in the general processing script.
# WEB_SITE_NAME_COPIED_USUAL="Paquet_de_CRIs_local"
# The name to use must be provided by the calling script (the build of the
# Paquet-de-CRIs) in FOR_HTTRACK_PROCESS_WEB_SITE_NAME_COPIED_USUAL_VALUE.
# When that variable is empty or not set, the problem is reported and the script stops.
if [ -z "${FOR_HTTRACK_PROCESS_WEB_SITE_NAME_COPIED_USUAL_VALUE}" ]
then
	echo ""
	echo "#--------------------ERROR starting HTTrack process ----------------------"
	echo "ERROR : the variable FOR_HTTRACK_PROCESS_WEB_SITE_NAME_COPIED_USUAL_VALUE is not set. It should have been set by the calling script"
	echo ""
	echo "ERROR : the HTTrack process will end with exit code : -75."

	echo "#--------------------ERROR starting HTTrack process ----------------------"
	# POSIX only defines "exit" for an unsigned status (0 to 255), so "exit -75" is
	# not portable under /bin/sh. Bash reports -75 as 181 (-75 modulo 256); exit
	# with 181 explicitly so that callers see that same status in every shell.
	exit 181
else
	WEB_SITE_NAME_COPIED_USUAL="${FOR_HTTRACK_PROCESS_WEB_SITE_NAME_COPIED_USUAL_VALUE}"
	echo ""
	echo "WEB_SITE_NAME_COPIED_USUAL=${WEB_SITE_NAME_COPIED_USUAL}"
fi


# Host (and optional path) of the web site to retrieve, without scheme and "www.".
WEB_SITE_URL_BASE='les-cris.com'
# In testing mode the base is narrowed down to one single page, and a warning
# showing the resulting value is printed. Useful for debug and quick tests.
case "${SCRIPTCTXT_TESTING_GEN}" in
	true)
		WEB_SITE_URL_BASE='les-cris.com/pages-110-au-fil-du-temps/les-ponctuels-2021/cri-au-fil-du-temps-210724-A-Liberte_Liberte_cherie_avec_l_Electronic_Frontier_Fondation.php'
		echo ""
		echo "WARN: Testing mode gen."
		echo "Only one page is retrieved from the Les-cris.com website. This makes the test quick."
		echo ""
		echo "WEB_SITE_URL_BASE=${WEB_SITE_URL_BASE}"

		echo "End of WARN: Testing mode gen."
		;;
esac

# Full start URL: the base prefixed with the https scheme and the "www." host part.
WEB_SITE_URL_GEN="https://www.${WEB_SITE_URL_BASE}"
# Root folder that receives the output (DEST_OUTPUT_FOLDER_ROOT).
# It must be provided by the calling script in
# FOR_HTTRACK_PROCESS_DEST_OUTPUT_FOLDER_ROOT_BASE_VALUE. When that variable is
# empty or not set, the problem is reported and the script stops.
if [ -z "${FOR_HTTRACK_PROCESS_DEST_OUTPUT_FOLDER_ROOT_BASE_VALUE}" ]
then
	echo ""
	echo "#--------------------ERROR starting HTTrack process ----------------------"
	echo "ERROR : the variable FOR_HTTRACK_PROCESS_DEST_OUTPUT_FOLDER_ROOT_BASE_VALUE is not set. It should have been set by the calling script"
	echo ""
	echo "ERROR : the HTTrack process will end with exit code : -49."
	echo "#--------------------ERROR starting HTTrack process ----------------------"

	# POSIX only defines "exit" for an unsigned status (0 to 255), so "exit -49" is
	# not portable under /bin/sh. Bash reports -49 as 207 (-49 modulo 256); exit
	# with 207 explicitly so that callers see that same status in every shell.
	exit 207
else
	# Example of value: DEST_OUTPUT_FOLDER_ROOT="/...xxx...../paquet-de-CRIs/${WEB_SITE_NAME_USUAL}"
	DEST_OUTPUT_FOLDER_ROOT="${FOR_HTTRACK_PROCESS_DEST_OUTPUT_FOLDER_ROOT_BASE_VALUE}"
	echo "DEST_OUTPUT_FOLDER_ROOT=${DEST_OUTPUT_FOLDER_ROOT}"
fi

# The output folder carries the name of the web site copy.
DEST_OUTPUT_FOLDER_NAME="${WEB_SITE_NAME_COPIED_USUAL}"

# User and group for the httrack process: the ones of whoever launched this script.
# Meant to be combined as "${UG_HTTRACKPROCESS_USER_NAME}:${UG_HTTRACKPROCESS_GROUP_NAME}".
UG_HTTRACKPROCESS_USER_NAME=$(whoami)
UG_HTTRACKPROCESS_GROUP_NAME=$(id -gn)

# Full path of the grep command.
GREP_TOOL_SHELL_COMMAND='/usr/bin/grep'

# HTTrack option groups. Each group is a string that starts empty and to which the
# enabled options are appended, each one preceded by a space.
# Documentation and Faq of the options : http://www.httrack.com/html/fcguide.html

# TRACK_SCOPE options : which URLs are accepted.
APP_OPTION_TRACK_SCOPE=
# Stay within the bounds of the files of the WEB_SITE_URL_BASE domain.
# Variant that also accepts extra pages:
#   APP_OPTION_TRACK_SCOPE="${APP_OPTION_TRACK_SCOPE} +*.${WEB_SITE_URL_BASE}/* +*my-pages-to-retrieve* "
APP_OPTION_TRACK_SCOPE="${APP_OPTION_TRACK_SCOPE} +*.${WEB_SITE_URL_BASE}/*"

# Options 01 : Action options. None is enabled; available ones:
APP_OPTION_01=
# w : *mirror web sites (--mirror)
#   APP_OPTION_01="${APP_OPTION_01} --mirror"
# W : mirror web sites, semi-automatic (asks questions) (--mirror-wizard)
#   APP_OPTION_01="${APP_OPTION_01} --mirror-wizard"
# g : just get files (saved in the current directory) (--get-files)
#   APP_OPTION_01="${APP_OPTION_01} --get-files"
# Y : mirror ALL links located in the first level pages (mirror links) (--mirrorlinks)
#   APP_OPTION_01="${APP_OPTION_01} --mirrorlinks"


# Options 02 : Limits options. Only --max-rate is enabled; the others are listed
# with a ready-to-use line.
APP_OPTION_02=
# rN : set the mirror depth to N (* r9999) (--depth[=N])
#   APP_OPTION_02="${APP_OPTION_02} --depth=999"
# %eN : set the external links depth to N (* %e0) (--ext-depth[=N])
#   APP_OPTION_02="${APP_OPTION_02} --ext-depth=1"
# mN : maximum file length for a non-html file (--max-files[=N])
#      mN,N' : for non html (N) and html (N')
#   APP_OPTION_02="${APP_OPTION_02} --max-files=1024,1024"
# MN : maximum overall size that can be uploaded/scanned (--max-size[=N])
#   APP_OPTION_02="${APP_OPTION_02} --max-size=1024"
# EN : maximum mirror time in seconds (60=1 minute, 3600=1 hour) (--max-time[=N])
#   APP_OPTION_02="${APP_OPTION_02} --max-time=3600"
# AN : maximum transfer rate in bytes/seconds (1000=1kb/s max) (--max-rate[=N])
#   APP_OPTION_02="${APP_OPTION_02} --max-rate=1000"
# Sample values, as converted by the original author:
#   --max-rate=3000000  : 3 Mb/s, so approximately 375 ko/s.
#   --max-rate=9000000  : 9 Mb/sec, so approximately 1125 ko/sec, so 1.125 Mo/sec.
#   --max-rate=90000000 : 90 Mb/sec, so approximately 10,73 Mo/sec.
# NOTE(review): the option is described above in bytes/second, while these
# conversions treat the number as bits per second - confirm the intended unit.
APP_OPTION_02="${APP_OPTION_02} --max-rate=90000000"
# -%! : bypass built-in security limits aimed to avoid bandwidth abuses
#       (bandwidth, simultaneous connections) (--disable-security-limits)
# Be careful with this option : requesting too fast can create a denial of service
# attack against the site.
#   APP_OPTION_02="${APP_OPTION_02} --disable-security-limits"

# %cN : maximum number of connections/seconds (*%c10)
# Very special for this web site; currently disabled:
#   APP_OPTION_02="${APP_OPTION_02} -%c1"

# GN : pause transfer if N bytes reached, and wait until lock file is deleted (--max-pause[=N])
#   APP_OPTION_02="${APP_OPTION_02} --max-pause=1024"

# Options 04 : Flow control. A timeout and a number of retries are enabled.
APP_OPTION_04=

# cN : number of multiple connections (*c8) (--sockets[=N])
# Very special for this web site; currently disabled:
#   APP_OPTION_04="${APP_OPTION_04} --sockets=1"

# TN : timeout, number of seconds after a non-responding link is shutdown (--timeout)
APP_OPTION_04="${APP_OPTION_04} --timeout=60"
# RN : number of retries, in case of timeout or non-fatal errors (*R1) (--retries[=N])
APP_OPTION_04="${APP_OPTION_04} --retries=4"
# JN : traffic jam control, minimum transfer rate (bytes/seconds) tolerated for a link (--min-rate[=N])
#   APP_OPTION_04="${APP_OPTION_04} --min-rate=1024"
# HN : host is abandoned if: 0=never, 1=timeout, 2=slow, 3=timeout or slow (--host-control[=N])
#   APP_OPTION_04="${APP_OPTION_04} --host-control=0"

# Options 06 : Links options. None is enabled; available ones:
APP_OPTION_06=
# %P : *extended parsing, attempt to parse all links, even in unknown tags or
#      Javascript (%P0 don't use) (--extended-parsing[=N])
#   APP_OPTION_06="${APP_OPTION_06} --extended-parsing=0"
# n : get non-html files 'near' an html file (ex: an image located outside) (--near)
#   APP_OPTION_06="${APP_OPTION_06} --near"
# t : test all URLs (even forbidden ones) (--test)
#   APP_OPTION_06="${APP_OPTION_06} --test"

# Options Build : Build options. Long file names are enabled.
# TODO: complete from the documentation : http://www.httrack.com/html/fcguide.html
APP_OPTION_BUILD=
# -LN : long names (L1 *long names / L0 8-3 conversion / L2 ISO9660 compatible) (--long-names[=N])
APP_OPTION_BUILD="${APP_OPTION_BUILD} --long-names=1"

# Template for one more build option:
#   APP_OPTION_BUILD="${APP_OPTION_BUILD} "

# Options SPIDER : Spider options. MIME types are assumed for php3, php and css.
# TODO: complete from the documentation : http://www.httrack.com/html/fcguide.html
APP_OPTION_SPIDER=
# -%A : assume that a type (cgi,asp..) is always linked with a mime type
#       (-%A php3,cgi=text/html;dat,bin=application/x-zip) (--assume <param>)
# About the speed of download, see also in the FAQ :
# Q: HTTrack is taking too much time for parsing, it is very slow. What's wrong?
# https://www.httrack.com/html/faq.html#QP3
APP_OPTION_SPIDER="${APP_OPTION_SPIDER} --assume php3=text/html,php=text/html,css=text/css"
# Template for one more spider option:
#   APP_OPTION_SPIDER="${APP_OPTION_SPIDER} "

# Options 07 : Browser ID. None is enabled.
# TODO: complete from the documentation : http://www.httrack.com/html/fcguide.html
APP_OPTION_07=
# Template for one browser ID option:
#   APP_OPTION_07="${APP_OPTION_07} "

# Options 08 : Log, index, cache.
# Enabled, in this order: --display, --extra-log, --verbose, --file-log.
APP_OPTION_08=
# %v : display on screen filenames downloaded (in realtime) (--display)
APP_OPTION_08="${APP_OPTION_08} --display"
# Q : no log - quiet mode (--do-not-log)
#   APP_OPTION_08="${APP_OPTION_08} --do-not-log"
# q : no questions - quiet mode (--quiet)
#   APP_OPTION_08="${APP_OPTION_08} --quiet"
# z : log - extra infos (--extra-log)
APP_OPTION_08="${APP_OPTION_08} --extra-log"
# Z : log - debug (--debug-log)
#   APP_OPTION_08="${APP_OPTION_08} --debug-log"
# v : log on screen (--verbose)
# Be careful with the --verbose and the --file-log options : per the original note,
# only the last one of the two is taken into account.
# Note : the --file-log is set in the script that launches HTTrack, with the file name added to it.
APP_OPTION_08="${APP_OPTION_08} --verbose"
# f : *log in files (--file-log)
# Added after --verbose on purpose, so that --verbose is not the effective one and
# the logs are written in files instead of in the terminal/console.
APP_OPTION_08="${APP_OPTION_08} --file-log"
# f2 : one single log file (--single-log)
#   APP_OPTION_08="${APP_OPTION_08} --single-log"
# I : *make an index (I0 don't make) (--index)
#   APP_OPTION_08="${APP_OPTION_08} -I0"
# %I : make a searchable index for this mirror (* %I0 don't make) (--search-index)
#   APP_OPTION_08="${APP_OPTION_08} --search-index"


# Options last : none is enabled.
APP_OPTION_10=
# Template for one last option:
#   APP_OPTION_10="${APP_OPTION_10} -"

# Debug dump: when SCRIPTCTXT_DEBUG is "true", print the main settings between two
# separator lines before HTTrack is launched.
# The three unfinished placeholder lines that printed a literal "=$" are replaced
# by settings assigned earlier in this script (start URL, output root, output folder).
if [ "x${SCRIPTCTXT_DEBUG}" = "xtrue" ]
then
	echo
	echo "========================================================================================"
	echo
	echo "Configuration of HTTrack :"
	echo
	echo "APPLICATION_HTTRACK_HOME_DIR=$APPLICATION_HTTRACK_HOME_DIR"
	echo "APPLICATION_HTTRACK_EXEC_NAME=$APPLICATION_HTTRACK_EXEC_NAME"
	echo "DEST_OUTPUT_OVERWRITE_FORMER_WEBSITE_COPY=$DEST_OUTPUT_OVERWRITE_FORMER_WEBSITE_COPY"
	echo "DEST_OUTPUT_FOLDER_ADD_DATETIMESTAMP=$DEST_OUTPUT_FOLDER_ADD_DATETIMESTAMP"
	echo "WEB_SITE_URL_GEN=$WEB_SITE_URL_GEN"
	echo "DEST_OUTPUT_FOLDER_ROOT=$DEST_OUTPUT_FOLDER_ROOT"
	echo "DEST_OUTPUT_FOLDER_NAME=$DEST_OUTPUT_FOLDER_NAME"
	echo
	echo
	echo "========================================================================================"
	echo

fi

# Launch the HTTrack command in the general script.
# The general script is sourced with "." (not executed as a separate program), so
# it runs in this shell and sees the variables assigned above. It is looked up in
# the directory recorded in SCRIPT_WEBSITE_COPIER_HOME_DIR at the start of this script.
. "$SCRIPT_WEBSITE_COPIER_HOME_DIR/httrack-launch-010-Gen.sh"



