#!/bin/sh
#******************************* License GPLv3 ********************************
# Factory_paquets_de_CRIs_gen : Factory to build one paquet-de-CRIs from the website www.les-cris.com, with a timestamp and a SHA256 signature of the package.
# Copyright (C) 2022  Antoine Herzog <info -!at-arobase=! les-cris -!dot-point=! com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
#******************************************************************************

# Launch the WGet retrieval of this web site.

# Remember the absolute folder of this script now, because the later
# processing needs to cd to other folders.
# "${BASH_SOURCE[0]}" is a bash-only array expansion that a strict POSIX sh
# rejects ("Bad substitution"), although the shebang is /bin/sh.
# "${BASH_SOURCE:-$0}" gives the same value under bash (also when this file is
# sourced) and falls back to $0 in other shells.
# NOTE(review): in a non-bash shell, $0 names the calling script when this file
# is sourced rather than executed.
# CDPATH is emptied for the cd, so that a relative folder name is resolved from
# the current directory only and cd prints nothing.
SCRIPT_WEBSITE_COPIER_HOME_DIR="$( CDPATH= cd "$( dirname "${BASH_SOURCE:-$0}" )" && pwd )"

# Debug mode switch: the value "true" activates it, anything else disables it.
### SCRIPTCTXT_DEBUG='true'
SCRIPTCTXT_DEBUG=''

# Testing mode switch: the value "true" activates it.
# In testing mode only one page of the web site is retrieved, so that a test
# run is quick.
# Note : for WGet, the value is defined in the WGet launch script.
### SCRIPTCTXT_TESTING_GEN='true'
SCRIPTCTXT_TESTING_GEN='false'

# Overwrite switch: "true" means the former copy of the web site is erased and
# overwritten; any other value keeps it, and the copy goes into a timestamped
# folder.
DEST_OUTPUT_OVERWRITE_FORMER_WEBSITE_COPY='false'
# Safety, when overwriting: the variable below is compared with
# DEST_OUTPUT_FOLDER_ROOT, to make sure that no other folder is erased
# ("rm -rf ..." !).
#DEST_OUTPUT_OVERWRITE_FORMER_WEBSITE_COPY_FOLDER_ROOT_FOR_CHECKING="/home/home_strghd/ant/Dev_Misc/SiteWeb/SiteWeb-Archives/SiteWeb-Archives-sites-aspires-220122-paquet-de-CRIs/smo-Essais-sites-internet/Smo_614_Gers-Numerique_com_TOTOTITITOTO"

# "true" appends a date and time stamp to the name of the output folder.
DEST_OUTPUT_FOLDER_ADD_DATETIMESTAMP='true'
# The predefined date and time stamp to use.
# It must be supplied by the calling script (the build of the Paquet-de-CRIs)
# in THIS_PAQUET_DE_CRIS_TIMESTAMP_USUALFORMAT. No value is calculated here:
# when the variable is empty or not set, the script reports it and stops.
#
# Exit status: POSIX only defines "exit" for operands from 0 to 255, and a
# strict sh (dash, for example) refuses a negative one. 193 is the status that
# bash already reports for "exit -63" (-63 modulo 256), so callers see the same
# value as before, now in every shell. The message text is left unchanged.
if [ -z "${THIS_PAQUET_DE_CRIS_TIMESTAMP_USUALFORMAT:-}" ]
then
	echo ""
	echo "#--------------------ERROR starting WGet process ----------------------"
	echo "ERROR : the variable THIS_PAQUET_DE_CRIS_TIMESTAMP_USUALFORMAT is not set. It should have been set by the calling script"
	echo ""
	echo "ERROR : the WGet process will end with exit code : -63."
	
	echo "#--------------------ERROR starting WGet process ----------------------"
	# Former fallback, kept for reference (not used any more) :
	# DEST_OUTPUT_FOLDER_PREDEFINED_DATETIMESTAMP=`date +%Y-%m-%d_%Hh%Mm`				# Datestamp e.g 2002-09-21_19h26m
	exit 193
else
	DEST_OUTPUT_FOLDER_PREDEFINED_DATETIMESTAMP="${THIS_PAQUET_DE_CRIS_TIMESTAMP_USUALFORMAT}"
	echo ""
	echo "DEST_OUTPUT_FOLDER_PREDEFINED_DATETIMESTAMP=${DEST_OUTPUT_FOLDER_PREDEFINED_DATETIMESTAMP}"
fi

# Location and name of the WGet executable (installed under /usr).
# NOTE(review): the former comment said "Httrack is installed with DNF (Yum),
# with the Fedora repository, or with pkg_add with the OpenBSD repository";
# presumably WGet is meant here - confirm.
APPLICATION_WGET_HOME_DIR='/usr/bin'
APPLICATION_WGET_EXEC_NAME='wget'

# Usual short name of the web site.
WEB_SITE_NAME_USUAL='les-CRIs_com'
# Default name of the copy of the web site made with WGet by these tool scripts.
WEB_SITE_NAME_COPIED_USUAL="${WEB_SITE_NAME_USUAL}-website-copied"
# For the Paquet-de-CRIs, the name is the following one. The timestamp is added
# in the general processing script.
# WEB_SITE_NAME_COPIED_USUAL="Paquet_de_CRIs_local"
# The predefined WEB_SITE_NAME_COPIED_USUAL to use.
# It must be supplied by the calling script (the build of the Paquet-de-CRIs)
# in FOR_WGET_PROCESS_WEB_SITE_NAME_COPIED_USUAL_VALUE. No value is calculated
# here: when the variable is empty or not set, the script reports it and stops.
#
# Exit status: POSIX only defines "exit" for operands from 0 to 255, and a
# strict sh (dash, for example) refuses a negative one. 177 is the status that
# bash already reports for "exit -79" (-79 modulo 256), so callers see the same
# value as before, now in every shell. The message text is left unchanged.
if [ -z "${FOR_WGET_PROCESS_WEB_SITE_NAME_COPIED_USUAL_VALUE:-}" ]
then
	echo ""
	echo "#--------------------ERROR starting WGet process ----------------------"
	echo "ERROR : the variable FOR_WGET_PROCESS_WEB_SITE_NAME_COPIED_USUAL_VALUE is not set. It should have been set by the calling script"
	echo ""
	echo "ERROR : the WGet process will end with exit code : -79."
	
	echo "#--------------------ERROR starting WGet process ----------------------"
	exit 177
else
	# Replaces any value given to WEB_SITE_NAME_COPIED_USUAL earlier in this script.
	WEB_SITE_NAME_COPIED_USUAL="${FOR_WGET_PROCESS_WEB_SITE_NAME_COPIED_USUAL_VALUE}"
	echo ""
	echo "WEB_SITE_NAME_COPIED_USUAL=${WEB_SITE_NAME_COPIED_USUAL}"
fi


# Base of the url of the web site (no scheme, no "www." prefix).
WEB_SITE_URL_BASE="les-cris.com"

# In testing mode (SCRIPTCTXT_TESTING_GEN is exactly "true"), the base is
# replaced with the url of one single page, and a warning is printed.
case "${SCRIPTCTXT_TESTING_GEN}" in
	true)
		WEB_SITE_URL_BASE="les-cris.com/pages-110-au-fil-du-temps/les-ponctuels-2021/cri-au-fil-du-temps-210724-A-Liberte_Liberte_cherie_avec_l_Electronic_Frontier_Fondation.php"
		echo ""
		echo "WARN: Testing mode gen."
		echo "Only one page is retrieved from the Les-cris.com website. This makes the test quick."
		echo ""
		echo "WEB_SITE_URL_BASE=${WEB_SITE_URL_BASE}"
		
		echo "End of WARN: Testing mode gen."
		;;
esac

# Full url : https scheme and "www." host prefix in front of the base.
WEB_SITE_URL_GEN="https://www.${WEB_SITE_URL_BASE}"
# The root folder that receives the output (DEST_OUTPUT_FOLDER_ROOT).
# It must be supplied by the calling script in
# FOR_WGET_PROCESS_DEST_OUTPUT_FOLDER_ROOT_BASE_VALUE. When the variable is
# empty or not set, the script reports it and stops.
#
# Exit status: POSIX only defines "exit" for operands from 0 to 255, and a
# strict sh (dash, for example) refuses a negative one. 207 is the status that
# bash already reports for "exit -49" (-49 modulo 256), so callers see the same
# value as before, now in every shell. The message text is left unchanged.
if [ -z "${FOR_WGET_PROCESS_DEST_OUTPUT_FOLDER_ROOT_BASE_VALUE:-}" ]
then
	echo ""
	echo "#--------------------ERROR starting WGet process ----------------------"
	echo "ERROR : the variable FOR_WGET_PROCESS_DEST_OUTPUT_FOLDER_ROOT_BASE_VALUE is not set. It should have been set by the calling script"
	echo ""
	echo "ERROR : the WGet process will end with exit code : -49."
	echo "#--------------------ERROR starting WGet process ----------------------"
	
	exit 207
else
	# Example of value : DEST_OUTPUT_FOLDER_ROOT="/...xxx...../paquet-de-CRIs/${WEB_SITE_NAME_USUAL}"
	DEST_OUTPUT_FOLDER_ROOT="${FOR_WGET_PROCESS_DEST_OUTPUT_FOLDER_ROOT_BASE_VALUE}"
	echo "DEST_OUTPUT_FOLDER_ROOT=${DEST_OUTPUT_FOLDER_ROOT}"
fi

# Post-processing switch : rename the ".php.html" extension, made by WGet, to
# ".html", in the file names and in the links inside the html pages.
# Usually this is done when mirroring a web site, and not done for an archive.
# For www.les-cris.com it is not done : the files keep the ".php.html"
# extension. The [file-name.php] stays visible in the html anchor links, which
# clearly distinguishes a PaquetXXXXXXXX, and the file names stay similar to the
# ones of the web site, so that they can be searched the same way.
POSTPROCESSING_IS_DO_CLEAN_DOT_PHP_DOT_HTML_TO_DOT_HTML='false'

# The name of the output folder is the name of the copied web site.
DEST_OUTPUT_FOLDER_NAME="${WEB_SITE_NAME_COPIED_USUAL}"

# One new line character, for logging and general needs.
# A command substitution strips the trailing new lines, so a sentinel "_" is
# printed after the new line and then removed.
NEW_LINE_CHAR="$(printf '\n_')"
NEW_LINE_CHAR="${NEW_LINE_CHAR%_}"

# The user and the group for the WGet process : the current ones, that launched
# the script. Meant to be used as
# "${UG_WGETPROCESS_USER_NAME}:${UG_WGETPROCESS_GROUP_NAME}".
UG_WGETPROCESS_GROUP_NAME="$(id -g -n)"
UG_WGETPROCESS_USER_NAME="$(whoami)"

# Full path of the grep shell command.
GREP_TOOL_SHELL_COMMAND='/usr/bin/grep'

# Build the full path of the output folder : <root>/<name>.
DEST_OUTPUT_FOLDER_FULLPATH_GEN="${DEST_OUTPUT_FOLDER_ROOT}/${DEST_OUTPUT_FOLDER_NAME}"

# When the datetimestamp is requested, "-<stamp>" is appended to the path.
# The stamp is the predefined one when it has a value; if it is empty or not
# defined, the current date and time is used (e.g. 2002-09-21_19h26m).
if [ "${DEST_OUTPUT_FOLDER_ADD_DATETIMESTAMP}" = "true" ]; then
	DEST_OUTPUT_FOLDER_DATETIMESTAMP_VALUE="${DEST_OUTPUT_FOLDER_PREDEFINED_DATETIMESTAMP:-$(date +%Y-%m-%d_%Hh%Mm)}"
	DEST_OUTPUT_FOLDER_FULLPATH_GEN="${DEST_OUTPUT_FOLDER_FULLPATH_GEN}-${DEST_OUTPUT_FOLDER_DATETIMESTAMP_VALUE}"
fi

# The log files go to a separate folder : the output folder path with the
# "-logs" suffix.
LOG_WGET_ROOTFOLDER_FULLPATH="${DEST_OUTPUT_FOLDER_FULLPATH_GEN}-logs"

# General log file and error log file, both inside the log folder.
LOG_WGET_GEN_FULLPATH_AND_FILE_NAME="${LOG_WGET_ROOTFOLDER_FULLPATH}/wget-mirror-website-log.txt"
LOG_WGET_ERR_FULLPATH_AND_FILE_NAME="${LOG_WGET_ROOTFOLDER_FULLPATH}/wget-mirror-website-err.txt"

# Name (without folder) of the file that tells whether there was some error in
# the global process. It is needed because WGet does not exit with an error
# when some pages were not retrieved, or were in error when trying to get them.
LOG_WGET_GLOBAL_END_OF_PROCESS_ERRORS_FILE_NAME='wget-log-for-global-end-of-process-errors.log'

# For all the options below, see the documentation :
# https://www.man7.org/linux/man-pages/man1/wget.1.html

# Options TRACK_SCOPE : the scope of the retrieval.
# The value is one space followed by the base url of the web site, so as not to
# go beyond the bounds of the files of the WEB_SITE_URL_BASE domain.
# Former form, kept for reference :
### APP_OPTION_TRACK_SCOPE="$APP_OPTION_TRACK_SCOPE +*.$WEB_SITE_URL_BASE/* +*my-pages-to-retrieve* "
APP_OPTION_TRACK_SCOPE=" ${WEB_SITE_URL_BASE}"

# Options 01 : Action options.
# Each active option is appended to APP_OPTION_01 with a leading space.
# The lines starting with "###" are available options that are switched off.
APP_OPTION_01=""

# -m --mirror : turns on options suitable for mirroring. Currently equivalent
#   to -r -N -l inf --no-remove-listing (also keeps the FTP directory listings).
### APP_OPTION_01="${APP_OPTION_01} --mirror"

# -r --recursive : turn on recursive retrieving (default maximum depth : 5).
APP_OPTION_01="${APP_OPTION_01} --recursive"

# -l depth --level=depth : maximum recursion depth. The default of 5 prevents
#   an accidental download of a very large web site. 0 or inf means infinite.
APP_OPTION_01="${APP_OPTION_01} --level=inf"

# -k --convert-links : after the download, convert the links in the documents
#   to make them suitable for local viewing.
APP_OPTION_01="${APP_OPTION_01} --convert-links"

# -K --backup-converted : when converting a file, back up the original version
#   with a .orig suffix. Affects the behavior of -N.
### APP_OPTION_01="${APP_OPTION_01} --backup-converted"

# -N --timestamping : turn on time-stamping.
APP_OPTION_01="${APP_OPTION_01} --timestamping"

# -p --page-requisites : download all the files needed to properly display a
#   given HTML page (inlined images, sounds, referenced stylesheets...).
### APP_OPTION_01="${APP_OPTION_01} --page-requisites"

# -A acclist --accept acclist / -R rejlist --reject rejlist :
#   comma-separated lists of file name suffixes or patterns to accept or reject.
#   An element containing one of the wildcard characters *, ?, [ or ] is treated
#   as a pattern rather than a suffix, and must be quoted to prevent the shell
#   from expanding it, like in -A "*.mp3" or -A '*.mp3'.
### APP_OPTION_01="${APP_OPTION_01} --accept acclist"

# --accept-regex urlregex / --reject-regex urlregex :
#   a regular expression to accept or reject the complete URL.
### APP_OPTION_01="${APP_OPTION_01} --"

# -np --no-parent : never ascend to the parent directory when retrieving
#   recursively, so that only the files below a certain hierarchy are downloaded.
### APP_OPTION_01="${APP_OPTION_01} --no-parent"

# See also :
# -D domain-list --domains=domain-list
#           Set domains to be followed. domain-list is a comma-separated
#           list of domains. Note that it does not turn on -H.
# --exclude-domains domain-list
#           Specify the domains that are not to be followed.
# See the man page, for other options.


# Options 02 : Limits options.
# Each active option is appended to APP_OPTION_02 with a leading space.
APP_OPTION_02=""

# --limit-rate=amount : limit the download speed to amount bytes per second.
#   The amount may carry the k suffix (kilobytes) or the m suffix (megabytes);
#   for example --limit-rate=20k limits the retrieval rate to 20KB/s.
#   See the man file for details.
APP_OPTION_02="${APP_OPTION_02} --limit-rate=1m"

# -w seconds --wait=seconds : wait the given number of seconds between the
#   retrievals. Recommended : it lightens the server load by making the
#   requests less frequent.
APP_OPTION_02="${APP_OPTION_02} --wait=1s"

# --random-wait : some web sites may perform log analysis to identify retrieval
#   programs such as Wget, by looking for statistically significant
#   similarities in the time between requests.
APP_OPTION_02="${APP_OPTION_02} --random-wait"

# -Q quota --quota=quota : download quota for automatic retrievals, in bytes
#   (default), kilobytes (k suffix) or megabytes (m suffix).
APP_OPTION_02="${APP_OPTION_02} --quota=500m"

# Template for one more option :
# APP_OPTION_02="${APP_OPTION_02} --=1m"


# Options 04 : Flow control.
APP_OPTION_04=""
# -T seconds --timeout=seconds : network timeout, in seconds. Equivalent to
#   --dns-timeout, --connect-timeout and --read-timeout, all at the same time.
APP_OPTION_04="${APP_OPTION_04} --timeout=30"
# -t number --tries=number : number of tries (0 or inf : infinite retrying).
#   The default is 20, except for fatal errors like "connection refused" or
#   "not found" (404), which are not retried.
APP_OPTION_04="${APP_OPTION_04} --tries=10"

# Options 05 : View control.
APP_OPTION_05=""
# --show-progress : force wget to display the progress bar in any verbosity.
APP_OPTION_05="${APP_OPTION_05} --show-progress"
# --progress=type : type of the progress indicator. The legal indicators are
#   "dot" and "bar". See the man file for details.
# APP_OPTION_05="${APP_OPTION_05} --progress=type=dot"

# Options 06 : Links options. None is active : the value stays empty.
# NOTE(review): the three switched-off options below (%P, n, t) read like
# HTTrack options rather than WGet ones - verify before enabling any of them.
APP_OPTION_06=""
# %P : extended parsing, attempt to parse all links, even in unknown tags or
#   Javascript (%P0 don't use) (--extended-parsing[=N])
### APP_OPTION_06="${APP_OPTION_06} --extended-parsing=0"
# n : get non-html files 'near' an html file (ex: an image located outside) (--near)
### APP_OPTION_06="${APP_OPTION_06} --near"
# t : test all URLs (even forbidden ones) (--test)
### APP_OPTION_06="${APP_OPTION_06} --test"

# Options Build : Build options.
# Each active option is appended to APP_OPTION_BUILD with a leading space.
APP_OPTION_BUILD=""

# -P prefix --directory-prefix=prefix : the directory where all the other files
#   and subdirectories are saved to, i.e. the top of the retrieval tree.
#   The default is . (the current directory).
#   Here : the full path of the output folder.
APP_OPTION_BUILD="${APP_OPTION_BUILD} --directory-prefix=${DEST_OUTPUT_FOLDER_FULLPATH_GEN}"

# -E --adjust-extension : if a file of type application/xhtml+xml or text/html
#   is downloaded and the URL does not end with the regexp \.[Hh][Tt][Mm][Ll]?,
#   the suffix .html is appended to the local filename.
APP_OPTION_BUILD="${APP_OPTION_BUILD} --adjust-extension"

# Templates for more options :
### APP_OPTION_BUILD="${APP_OPTION_BUILD} "
### APP_OPTION_BUILD="${APP_OPTION_BUILD} "

# Options SPIDER : Spider options. None is defined : the value stays empty.
APP_OPTION_SPIDER=''
# Templates for more options :
### APP_OPTION_SPIDER="${APP_OPTION_SPIDER} "
### APP_OPTION_SPIDER="${APP_OPTION_SPIDER} "

# Options 07 : Browser ID. None is defined : the value stays empty.
APP_OPTION_07=''
# Templates for more options :
### APP_OPTION_07="${APP_OPTION_07} "
### APP_OPTION_07="${APP_OPTION_07} "
### APP_OPTION_07="${APP_OPTION_07} "

# Options 08 : Log, index, cache.
# Each active option is appended to APP_OPTION_08 with a leading space.
APP_OPTION_08=""

# -o logfile --output-file=logfile : log all the messages to logfile (they are
#   normally reported to standard error). Here : the general log file.
APP_OPTION_08="${APP_OPTION_08} --output-file=${LOG_WGET_GEN_FULLPATH_AND_FILE_NAME}"

# -q --quiet : turn off Wget's output.
# APP_OPTION_08="${APP_OPTION_08} --quiet"

# --rejected-log=logfile : log all the URL rejections to logfile, as comma
#   separated values (the reason of the rejection, the URL, and the parent URL
#   it was found in). Here : the error log file.
APP_OPTION_08="${APP_OPTION_08} --rejected-log=${LOG_WGET_ERR_FULLPATH_AND_FILE_NAME}"


# Options last : none is defined, the value stays empty.
APP_OPTION_10=''
# Template for one more option :
# APP_OPTION_10="${APP_OPTION_10} -"

# For debug : when SCRIPTCTXT_DEBUG is exactly "true", print the main values of
# the WGet configuration to the standard output.
# The three former placeholder lines, which printed the literal text "=$", now
# print the url, the output folder and the log folder that were computed above.
if [ "x${SCRIPTCTXT_DEBUG}" = "xtrue" ]
then
	echo
	echo "========================================================================================"
	echo
	echo "Configuration of WGet :"
	echo
	echo "APPLICATION_WGET_HOME_DIR=$APPLICATION_WGET_HOME_DIR"
	echo "APPLICATION_WGET_EXEC_NAME=$APPLICATION_WGET_EXEC_NAME"
	echo "DEST_OUTPUT_OVERWRITE_FORMER_WEBSITE_COPY=$DEST_OUTPUT_OVERWRITE_FORMER_WEBSITE_COPY"
	echo "DEST_OUTPUT_FOLDER_ADD_DATETIMESTAMP=$DEST_OUTPUT_FOLDER_ADD_DATETIMESTAMP"
	echo "WEB_SITE_URL_GEN=$WEB_SITE_URL_GEN"
	echo "DEST_OUTPUT_FOLDER_FULLPATH_GEN=$DEST_OUTPUT_FOLDER_FULLPATH_GEN"
	echo "LOG_WGET_ROOTFOLDER_FULLPATH=$LOG_WGET_ROOTFOLDER_FULLPATH"
	echo
	echo
	echo "========================================================================================"
	echo

fi

# Source the general launch script, located in the same folder as this script.
# It is read in the current shell, so every variable set above is visible to
# it. Presumably it is the one that runs the WGet command - it is not visible
# from this file.
. "${SCRIPT_WEBSITE_COPIER_HOME_DIR}/wget-mirror-website-launch-010-Gen.sh"



