#!/bin/bash
# compress overlarge files of scanned documents, run with -h for help
# for similar but different approaches see https://askubuntu.com/questions/113544/how-can-i-reduce-the-file-size-of-a-scanned-pdf-file
# esp the answer that starts 'If you have a pdf with scanned images...'

VERSION="2.0.4 [01 Jan 2026]"

echolog() {
	# echo and/or put to log
	# example: echolog 0 "1 line message" "\nsubsequent lines"
	# param1 is the (quasi-exit) code, param2 is the main message, param3 is any additional info (which is never put to log)
	# Specifically, param1 values have the following effects:
	#   param1         echo                log
	#   ------ -------------------- ------------------
	#     0    stdout(unless quiet) yes*
	#    >0    stderr               yes*
	#    -1    no                   yes*
	#   <-1    no                   yes(unless quiet)*
	#
	# * with overrides:
	#     test mode               - do not log
	#     running from a terminal - do not log
	#
	# Globals read: QUIET (non-empty means quiet), TEST (non-empty means test mode),
	#               THIS (used as the tag of the log entry)
	# Always returns 0, so it never upsets a surrounding && / || chain
	local code=$1 message=$2 extra=$3
	# nothing to say: nothing to do
	[[ -n $message$extra ]] || return 0
	if [[ $code -eq 0 ]]; then
		[[ -z $QUIET ]] && echo -e "$message$extra"
	elif [[ $code -gt 0 ]]; then
		echo -e "$message$extra" >&2
	fi
	# negative codes are never echoed (see table); codes below -1 are additionally not logged when quiet
	[[ $code -lt -1 && -n $QUIET ]] && return 0
	local ASCII_TEXT
	# only the main message is logged, with non-ASCII bytes removed; printf rather than echo
	# so that a message such as "-n" is not swallowed as an option
	ASCII_TEXT="$(printf '%s\n' "$message"|LC_ALL=C tr -dc '\0-\177')"
	# stdin being a terminal is taken to mean "running from a terminal"
	[[ -n $ASCII_TEXT && -z $TEST && ! -t 0 ]] && logger -t "${THIS}[$$]" -- "$ASCII_TEXT"
	return 0
}

quit() {
	# report a final message through echolog, then terminate the script
	# param1: exit code (also handed to echolog, which uses it to choose stdout or stderr)
	# param2: optional short exit message (shown and logged)
	# param3: optional longer message, e.g. multiline (shown, not logged)
	# example: quit 1 "No valid signature" "\nyou need to add a valid signature here"
	local status=$1 summary=$2 detail=$3
	echolog "$status" "$summary" "$detail"
	exit "$status"
}

# name of this script without its path; used in messages, log tags and temporary file names
THIS="$(basename "$0")"
# make a pipeline fail if any of its stages fails
set -o pipefail

# width used when wrapping help/changelog text: 'stty size' prints "rows cols" so keep
# only the last word; without a terminal fall back to a very large width
COLUMNS="$(stty size 2>/dev/null || echo 30000)"
COLUMNS="${COLUMNS##* }"

# defaults (several can be changed by the command-line options parsed below)
GS_RESIZE=300            # GhostScript resize parameter, passed to gs as -r
WHITE_THRESHOLD=.7       # see option -b
BLACK_THRESHOLD=.5       # see option -e
DEVOUT=/dev/stdout       # where progress output goes; option -q points it at /dev/null
DTEMP=/tmp               # directory for temporary files, see option -t

# parse the command-line options; the leading ':' in the option string selects getopts'
# silent error reporting, so an unknown option arrives as '?' and a missing argument
# as ':' (in both cases with the offending option letter in OPTARG)
while getopts ":b:de:fhlmnp:qr:st:wy" flag; do
	case "$flag" in
		b) WHITE_THRESHOLD="$OPTARG" ;;
		d) DEBUG="y" ;;
		e) BLACK_THRESHOLD="$OPTARG" ;;
		f) FORCE_OVERWRITE="y" ;;
		h) HELP="y" ;;
		l) CHANGELOG="y" ;;
		m)
			# check-only mode: behaves like -n but never changes anything
			NOREPEAT="y"
			NOACTION="y"
			;;
		n) NOREPEAT="y" ;;
		p) read -ra PARAMS <<<"$OPTARG" ;; # split the value on whitespace into an array
		q)
			QUIET="q"
			DEVOUT="/dev/null"
			;;
		r) RESIZE_AIM="$OPTARG" ;;
		s) STRIP_FINAL="y" ;;
		t) DTEMP="$OPTARG" ;;
		#v) VERBOSE="y" ;;
		w) COLUMNS=30000 ;;
		y) OVERWRITE="y" ;;
		\?)
			echo "Unknown option $OPTARG">&2
			exit 1
			;;
		:)
			echo "No argument value for option $OPTARG">&2
			exit 1
			;;
		*) # Should not occur
			echo "Unknown error while processing options">&2
			exit 1
			;;
	esac
done
# drop the parsed options so that $1 and $2 are the source and (optional) destination
shift $((OPTIND-1))
# no source file given (and changelog not requested): fall back to showing the help
[[ -z $1 && -z $CHANGELOG ]] && HELP=y
# banner: always shown with help/changelog, otherwise suppressed by -q;
# ${THIS//?/=} underlines with one '=' per character of the script name
[[ -n $HELP$CHANGELOG || -z $QUIET ]] && echo -e "\n$THIS v$VERSION - by Dominic (-h for help)\n${THIS//?/=}\n"
# help text: paragraphs are written as single logical lines (backslash-newline joins them)
# and then wrapped by fold to the width held in COLUMNS; every option accepted by the
# getopts string, including -d and -w, is listed under 'Options'
if [[ -n $HELP ]]; then
	echo -e "GNU/Linux program to compress an overly large pdf file to smaller \
monochrome. Typical use case is a file generated by scanning with \
unnecessarily high resolution and/or color. The objective \
is a file that although not of the same quality as the original, \
is still legible and much smaller (90%+ size reductions are common). \
This is achieved by conversion to 1-bit pixel mapping, so \
is typically most effective for dark text on a white background.

If destination path/file is unspecified $THIS will create a file in the same \
location as source, with -1 suffixed to the filename (before .pdf \
extension). If destination is a directory then the created \
file will have the same name as the source file. $THIS will also set the \
destination file to have the same ownership, group and modtime as the \
original file (but ownership can only be changed if running as superuser).

If a target maximum percentage size is set (-r option) and the initial \
compression attempt \
does not achieve at least this reduction, $THIS will retry, varying \
the GhostScript resize parameter (initial value $GS_RESIZE) until \
(hopefully) it can achieve something close to the specified value (but \
absolute accuracy of resizing is not to be expected).

If qpdf is present it is used (after GhostScript) to make a further c.10% \
filesize saving with no effect on quality; with -r this 10% is \
included in the specified target so \
the resulting file will have slightly better quality, not smaller \
size, than would have been achieved without qpdf. Downside: compressing with \
qpdf is relatively slow.

If exiftool is present it is used to update the 'Producer' metadata in \
the destination file to show that it was processed by $THIS, \
and when - see also options -m and -n.

With option -s, if the final page (not page 1) appears empty, it is stripped.

[Deprecated: specify additional options to be passed to the ImageMagick \
convert operation within $THIS by using -p; for instance, to hide a watermark \
try -p '-white-threshold 60%'.]

Usage: ./$THIS [options] source_path/filename [destination_path[/filename]]

Options   : -b value - set background white threshold (lower value turns more non-white background white; default value $WHITE_THRESHOLD)
            -d - debug mode (show extra diagnostic output and keep the temporary png files)
            -e value - set foreground black threshold (higher value darkens text; default value $BLACK_THRESHOLD) (think 'e' for emphasis)
            -f - replace original file
            -h - show this help and exit
            -l - show changelog and exit
            -m - test (without making any changes) if file has previously been processed by \
$THIS (like -n below); exit code 1 if so or 0 if not (can be \
combined with -q for silent checking) - requires exiftool
            -n - skip making any changes if file has previously been processed by $THIS \
(i.e. if the 'Producer' metadata contains '$THIS') - requires exiftool
            -p 'param1 param2' - additional parameters for ImageMagick \
'convert' command [deprecated]
            -q - be quieter (1 line output if successful)
            -r num - target maximum percentage size vs original (so, 40 \
means at least 60% file size reduction)
            -s - remove any apparently-empty final page
            -t path - directory to use for temporary files, which can be \
large (default: /tmp)
            -w - do not wrap help and changelog output to the terminal width
            -y - overwrite destination file if it already exists

Dependencies: bash(4+) convert(ImageMagick) exiftool* gs(GhostScript) qpdf*
* not required but used for additional functionality if available

License: Copyright © 2026 Dominic Raferd. Licensed under the Apache License, \
Version 2.0 (the \"License\"); you may not use this file except in compliance \
with the License. You may obtain a copy of the License at \
https://www.apache.org/licenses/LICENSE-2.0. Unless required by applicable \
law or agreed to in writing, software distributed under the License is \
distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY \
KIND, either express or implied. See the License for the specific language \
governing permissions and limitations under the License.
"|fold -s -w"$COLUMNS"
fi
# changelog (option -l), newest entry first, wrapped to the width held in COLUMNS
if [[ -n $CHANGELOG ]]; then
	# when the help text was printed as well, give the changelog its own heading
	if [ -n "$HELP" ]; then
		echo "Changelog:"
	fi
	echo -e "\
2.0 [13 May 2025] - add option -e to allow darkening of faint text (or lightening of excessively dark foreground)
1.9 [09 Nov 2023] - add option -s to strip any blank final page if option -s is selected (unless the file is a single blank page) [option change from 1.8]
1.8 [02 Nov 2023] - strip any blank final page (unless the file is a single blank page or option -s is selected)
1.7 [27 Oct 2023] - fix to pdf format identification
1.6 [19 Oct 2023] - minor fixes and shellcheck conformations
1.5 [15 Oct 2023] - add error logging unless running from a terminal (via echolog and quit functions)
1.4 [03 Jan 2023] - add -b (allow white background threshold to be varied)
1.3 [28 Apr 2022] - add -m (check if already processed without taking any other action)
1.2 [11 Apr 2022] - add -f (force overwrite) option
1.1 [08 Apr 2022] - improved output using GS 'Hybrid2' setting by Adam Lesser - kudos (see https://bugs.ghostscript.com/show_bug.cgi?id=694762)
1.0 [31 Mar 2022] - minor changes
0.9 [03 Mar 2022] - add -n option, set 'Producer' metadata (no effect if exiftool unavailable), remove -v option, make -q option non-silent
0.8 [13 Feb 2022] - modify -r option to be an 'aim for' percentage size reduction (by looping), remove -n (negate) option as irrelevant, add use of qpdf (if present) for additional c.10% size saving
0.7 [10 Feb 2022] - improved (and simplified) by using gs pngmonod device, align owner/group, permissions and modtime of created file with those of the original
0.6 [05 Aug 2020] - added more gs settings (kudos: Enno Nagel)
0.5 [16 Nov 2017] - negate by default (-n to non-negate), default resize 200 not 100, add -p option
0.4 [23 Apr 2017] - add -n (negative) option
0.3 [16 Oct 2016] - fix for temporary file deletion, make temporary filenames unique
0.2 [21 Sep 2016] - lots of tweaks
0.1 [20 Sep 2016] - initial version
"|fold -sw"$COLUMNS"
fi
# help and changelog are informational only: stop here, before any file is touched
if [[ -n $HELP$CHANGELOG ]]; then
	exit 0
fi

# check that we have valid inputs
[[ -d $DTEMP ]] ||  quit 1 "Can't locate temporary directory '$DTEMP': aborting"
# common prefix (temporary directory + script name + numeric user id) of the temporary per-page png files
TMPPRE="$DTEMP/$THIS-$(id -u)"

[[ -n $DEBUG ]] && echo "PARAMS has ${#PARAMS[@]} elements: 0 is '${PARAMS[0]}', 1 is '${PARAMS[1]}'"
SOURCE="$1"
[[ -f $SOURCE ]] || quit 1 "Can't locate source file '$SOURCE': aborting"
# a pdf is recognised by the %PDF marker appearing within the first 120 bytes of the file
[[ $(dd bs=1 count=120 skip=0 if="$SOURCE" 2>/dev/null|strings -s" "|sed 's/^\s*//') =~ "%PDF" ]] || quit 1 "Source file '$SOURCE' is not pdf, aborting"
SFILE="$(basename "$SOURCE")"
# some code below depends on the source file having a 3-character extension, so test for this
[[ ${SFILE:$((${#SFILE}-4))} =~ \....$ ]] || quit 1 "Source file '$SOURCE' must have a 3-character extension (e.g. .pdf), aborting"
SDIR="$(dirname "$(realpath "$SOURCE")")"
[[ -d $SDIR ]] || quit 1 "Can't locate source directory '$SDIR': aborting"
# original size in bytes, the baseline for all percentage calculations
SSIZE=$(stat -c%s "$SOURCE")
[[ -n $DEBUG ]] && echo -e "Source is '$SOURCE'\nSDIR is '$SDIR'"
QPDF="$(command -v qpdf 2>/dev/null)"
# percentage multiplier applied to the intermediate png size when estimating the final pdf size;
# a lower estimate is used when qpdf is available to recompress the result
[[ -n $QPDF ]] && PDF_GROW_FACTOR=120 || PDF_GROW_FACTOR=130
EXIFTOOL="$(command -v exiftool 2>/dev/null)"
if [[ -s $EXIFTOOL && -n $NOREPEAT ]]; then
	# check if we already processed this file
	# exiftool -S prints "Producer: value"; dropping the first space-separated field leaves just the value
	PRODUCER="$("$EXIFTOOL" -m -q -S -Producer "$SOURCE" 2>/dev/null|cut -d" " -f1 --complement)"
	# literal "contains" test: the script name must not be interpreted as a regular expression
	# (a '.' would match any character, and characters such as '+' or '[' change or break the pattern)
	[[ $PRODUCER == *"$THIS"* ]] && { [[ -z $QUIET ]] && echo "$SOURCE already processed by $PRODUCER, rerun without -n to force reprocessing" >"$DEVOUT"; exit 1; }
elif [[ ! -s $EXIFTOOL && -z $QUIET ]]; then
	echo "Advisory: exiftool not found, operations using exiftool will be skipped" >"$DEVOUT"
fi
# NOTE(review): with -m and no exiftool the check above cannot run, yet the exit code is still 0 ("not processed") - confirm this is intended
[[ -n $NOACTION ]] && { [[ -z $QUIET ]] && echo "No action taken"; exit 0; }

# in quiet mode the (single line of) output is prefixed with the script name
[[ -n $QUIET ]] && echo -n "$THIS: "
DEST=$2
if [[ -z $DEST ]]; then
	# no destination given: same directory as the source, with -1 inserted before the
	# final 4 characters of the name (i.e. before the 3-character extension and its dot)
	DEST="$SDIR/${SFILE:0:$((${#SFILE}-4))}-1${SFILE:$((${#SFILE}-4))}"
else
	[[ -n $FORCE_OVERWRITE ]] && quit 1 "Incompatible settings, -f with specified destination: aborting"
	if [[ -d $DEST ]]; then
		# destination is a directory: keep the source file name, avoiding a doubled slash
		[[ ${DEST:$((${#DEST}-1))} == "/" ]] && DEST="$DEST$SFILE" || DEST="$DEST/$SFILE"
	fi
fi
DDIR="$(dirname "$DEST")"
[[ -d $DDIR ]] || quit 1 "Can't locate destination directory '$DDIR': aborting"
[[ -n $OVERWRITE || ! -f $DEST ]]  || quit 1 "Won't overwrite existing destination file '$DEST', specify -y to override, aborting"
# yes I realise this could be tricked and then overwriting would then be possible
[[ $DEST != "$SOURCE" ]] || quit 1 "Won't overwrite original file '$SOURCE', aborting"

[[ $GS_RESIZE =~ ^[1-9][0-9]*$ ]] || quit 1 "Resize percentage '$GS_RESIZE' is invalid, aborting"
# the -r target is later used in integer comparisons and as a divisor when the resize value
# is recalculated, so (when given) it must be a positive whole number without leading zeros
[[ -z $RESIZE_AIM || $RESIZE_AIM =~ ^[1-9][0-9]*$ ]] || quit 1 "Target percentage size '$RESIZE_AIM' (option -r) is invalid, it must be a positive whole number: aborting"
GS="$(command -v gs 2>/dev/null)" || quit 1 "Can't locate gs - ghostscript packages must be installed e.g. apt-get install ghostscript"
CONVERT="$(command -v convert 2>/dev/null)" || quit 1 "Can't locate convert - imagemagick packages must be installed e.g. apt-get install imagemagick"

# we are good to go

# delete any left-over temporary files (from a previous failed or debug run)
if ! find "$DTEMP" -maxdepth 1 -regextype posix-extended -regex "$TMPPRE-[0-9]*-[0-9][0-9][0-9]\.png" -delete; then
	quit 1 "Unable to delete pre-existing temporary files, aborting"
fi

if [[ -z $QUIET && -n $RESIZE_AIM ]]; then
	echo "Aiming for max% of original size: ${RESIZE_AIM}%"
fi
if [[ -z $QUIET ]]; then
	echo -n "Extracting pages from $SOURCE: "
fi
# render every page to a 1-bit png; when a target size (-r) was requested, repeat with a
# recalculated resize value until the estimated result is small enough, or until the
# attempt limit is reached
while true; do
	RESIZE_LOOP=$((RESIZE_LOOP+1))
	# see https://bugs.ghostscript.com/show_bug.cgi?id=694762
	"$GS" -q -dSAFER -dNOPAUSE -dBATCH -dUseCropBox -sOutputFile="$TMPPRE-$$-%03d.png" -r"$GS_RESIZE" -dDITHER=300 -sDEVICE=pngmonod -Ilib stocht.ps -c "{ dup $BLACK_THRESHOLD lt { pop 0 } if dup $WHITE_THRESHOLD gt { pop 1 } if } settransfer" -f "$SOURCE" >/dev/null 2>&1 || quit 1 "gs gave error $? when processing '$SOURCE', aborting"
	# previous (less good) conversion approaches, kept here for reference:
	#  v0.7-v1.0: "$GS" -r$GS_RESIZE -dNOPAUSE -dBATCH -dSAFER -sDEVICE=pngmonod -dDownScaleFactor=2 -sOutputFile="$TMPPRE-$$-%03d.png" "$SOURCE" >/dev/null 2>&1 || { echo "$THIS: gs gave error $?, aborting" >&2; exit 1; }
	#  v0.6-v0.6: "$GS" -dNOPAUSE -dBATCH -dSAFER -dPrinted=false -dPDFSETTINGS=/default -dCompatibilityLevel=1.4 -dDetectDuplicateImages=true -dDownsampleColorImages=true -dEmbedAllFonts=true -dSubsetFonts=true -dCompressFonts=true -dAutoRotatePages=/None -sDEVICE=pngalpha -sOutputFile="$TMPPRE-$$-%03d.png" "$SOURCE" >/dev/null 2>&1 || { echo "$THIS: gs gave error $?, aborting" >&2; exit 1; }
	if [[ -z $QUIET ]]; then
		echo -n "OK"
	fi
	# a single pass is enough unless a target was given; in any case give up after the 6th pass
	if [[ -z $RESIZE_AIM || $RESIZE_LOOP -ge 6 ]]; then
		break
	fi
	# total size in bytes of all page images (the last line of 'du -c' is the grand total)
	ISIZE=$(du -bc "$TMPPRE-$$-"*.png|tail -n1|cut -f1)
	PCT=$(( 100*ISIZE/SSIZE ))
	# estimate of the final pdf size as a percentage of the original
	PCT_FINAL_EST=$(( PDF_GROW_FACTOR*ISIZE/SSIZE ))
	if [[ -z $QUIET ]]; then
		echo -en "\n  Intermediate file size(s) (based on resize ${GS_RESIZE}%): $ISIZE (${PCT}% of original)"
	fi
	if [[ $PCT_FINAL_EST -le $RESIZE_AIM ]]; then
		break
	fi
	# scale the resize value by the square root of (estimate+5)/target
	NEW_GS_RESIZE=$(echo "$GS_RESIZE $((PCT_FINAL_EST+5)) $RESIZE_AIM"|awk '{print int($1/sqrt($2/$3))}')
	# ensure we do not get stuck in an endless loop
	if [[ $NEW_GS_RESIZE -ge $GS_RESIZE ]]; then
		NEW_GS_RESIZE=$((GS_RESIZE-2))
	fi
	GS_RESIZE=$NEW_GS_RESIZE
	if [[ -z $QUIET ]]; then
		echo -en "\n  Trying again with resize ${GS_RESIZE}%: "
	fi
done

if [[ -n $STRIP_FINAL ]]; then
	# check if we should strip the last blank page
	# the page files are numbered -001.png, -002.png ... so the first entry of a reverse-sorted listing is the final page
	# shellcheck disable=SC2012
	LAST_PAGE="$(ls -1r "$TMPPRE-$$-"*.png|head -n1)"
	if [[ ! "$LAST_PAGE" =~ 001\.png$ ]]; then # don't strip the only page
		# it is 2 color, look at the mean color and the standard deviation, overwhelmingly white with minimal st dev indicates blank [25 Apr 2025]
		# awk prints the two values (each multiplied by 1000 and truncated) on separate lines, mean first,
		# so each needs its own 'read': a single 'read -r MEAN STD_DEV' only consumes the first line and
		# leaves STD_DEV empty, which then always satisfies the '-le 1' test below
		{ read -r MEAN; read -r STD_DEV; } < <(identify -verbose "$LAST_PAGE"|grep -E "^\s+(mean|standard deviation):"|awk '{print int($(NF-1)*1000)}')
		SIZE=$(stat -c%s "$LAST_PAGE")
		[[ -n $DEBUG ]] && echo -e "\n$LAST_PAGE mean: $MEAN stddev: $STD_DEV size: $SIZE bytes"
		# the mean and std_dev settings here are very cautious i.e. >=99.9% white and <=0.1% standard dev, might need tweaking
		# NOTE(review): now that STD_DEV really is populated this test is stricter than it was in practice
		# (previously only MEAN and SIZE had any effect) - the thresholds may need retuning
		[[ $MEAN -ge 999 && $STD_DEV -le 1 || $SIZE -le 3072 ]] && rm -- "$LAST_PAGE" && [[ -n $DEBUG ]] && echo " - removed" # also, if <=3k assume it is blank
	fi
fi

#  combine png files to create a new pdf file
[[ -z $QUIET ]] && echo -en "\nBuilding $DEST: "
# specifying -adjoin and/or -colors 2 has no effect
# the page files are supplied by the glob itself (which expands in sorted, i.e. page, order)
# and PARAMS is expanded quoted, so a temporary directory containing whitespace or glob
# characters can no longer be split into several bogus arguments; an unset/empty PARAMS
# expands to no arguments at all
if "$CONVERT" "$TMPPRE-$$-"*.png "${PARAMS[@]}" "$DEST"; then
	echo -n "OK" >"$DEVOUT"
else
	# here $? still holds the exit status of the failed convert command
	quit 1 "convert gave error $? when processing intermediate files from '$SOURCE', aborting"
fi
# if possible, set the 'Producer' tag/metadata so we know we already processed this file
if [[ -s $EXIFTOOL ]]; then
	if [[ -z $QUIET ]]; then
		echo -en "\nAdding Producer metadata to $DEST: "
	fi
	# a failure here is only reported, it does not stop the script
	if "$EXIFTOOL" -m -q -z -overwrite_original_in_place -Producer="$THIS v$VERSION on $(date +%F)" "$DEST" 2>/dev/null; then
		echo -n "OK" >"$DEVOUT"
	else
		echo -n "FAIL" >"$DEVOUT"
	fi
fi
if [[ -n $QPDF ]]; then
	# this should save about 10% size on the destination file, with no effect on quality [13 Feb 2022]
	if [[ -z $QUIET ]]; then
		echo -en "\nRecompressing: "
	fi
	"$QPDF" --recompress-flate --compression-level=9 --object-streams=generate --replace-input --warning-exit-0 "$DEST" 2>/dev/null
	RESULT=$?
	# qpdf may create this backup file but we do not need it
	if [[ -f ${DEST}.~qpdf-orig ]]; then
		rm -- "${DEST}.~qpdf-orig"
	fi
	if [[ -z $QUIET ]]; then
		if [[ $RESULT == 0 ]]; then
			echo "OK"
		else
			echo "FAIL"
		fi
	fi
	# unlike the metadata step, a qpdf failure is fatal
	if [[ $RESULT -ne 0 ]]; then
		quit 1 "qpdf gave error $RESULT when compressing '$SOURCE', check file '$DEST', aborting"
	fi
fi
# align ownership, permissions and mod time of new file to that of old
if [[ $(id -u) -ne 0 ]]; then
	# non-root users cannot change ownership, but we can at least get the group same as original file
	chgrp --reference="$SOURCE" "$DEST"
else
	chown --reference="$SOURCE" "$DEST"
fi
chmod --reference="$SOURCE" "$DEST"
touch --reference="$SOURCE" "$DEST"
# tidy up by removing .png files (in debug mode they are kept and listed instead)
if [[ -n $DEBUG ]]; then
	echo "Debug mode, retained:"
	# shellcheck disable=SC2012
	ls -l "$TMPPRE-$$-"*.png|sed 's/^/  /'
else
	rm -- "$TMPPRE-$$-"*.png
fi

# with -f the compressed file takes the place of the original; if the move fails, stop with
# an error rather than reporting a success that did not happen (the compressed file is then
# still available under its own name)
if [[ -n $FORCE_OVERWRITE ]]; then
	mv -- "$DEST" "$SOURCE" || quit 1 "Unable to replace original file '$SOURCE' with '$DEST', aborting"
	DEST="$SOURCE"
fi

# all done, tell the world about it
# sizes are shown in K (integer division); the percentage is the reduction relative to the original size
DSIZE=$(stat -c%s "$DEST")
echo -e "$DEST [$((DSIZE/1024))K] from $SOURCE [$(( SSIZE/1024 ))K], compression $(( (SSIZE-DSIZE)/1024 ))K [$(( 100-100*DSIZE/SSIZE ))%]"
exit 0
