#!/bin/bash
# Extract form tags from an html file or web page; run with -h for usage.
VERSION="3.0.2 [01 Jan 2026]"
THIS=$(basename "$0")

# Width used to wrap help/changelog text. "stty size" prints "rows columns";
# without a terminal it fails and an effectively unlimited width is used.
COLUMNS=$(stty size 2>/dev/null||echo 30000); COLUMNS=${COLUMNS##* }
# Guard against a zero or non-numeric width, which fold would reject
[[ $COLUMNS =~ ^[1-9][0-9]*$ ]] || COLUMNS=30000

# Start every option flag empty so that same-named variables inherited from
# the environment (DEBUG, QUIET, HELP, ...) cannot silently alter behaviour.
AUTOTAG=""; CHANGELOG=""; COOKIES=""; DATATAG=""; DEBUG=""; FORMACTION=""
FORMID=""; FORWGET=""; HELP=""; NO_HEADER=""; QUIET=""; SHOWHIDDENONLY=""
SHOWJAVASCRIPTTAGS=""

# Leading ":" selects silent getopts error reporting: an unknown option
# arrives as "?" and a missing argument as ":", both with the offending
# option letter in OPTARG. Diagnostics go to stderr so they never pollute
# output that a calling script is capturing.
while getopts ":ac:dfFghi:jlnquwx" optname; do
	case "$optname" in
		"a")	SHOWHIDDENONLY="y"; AUTOTAG="y"; QUIET="y";;
		# COOKIES accumulates extra wget arguments and is later expanded
		# unquoted, so a cookie path containing whitespace is not supported
		"c")	COOKIES="$COOKIES --keep-session-cookies --load-cookies $OPTARG --save-cookies $OPTARG";;
		"d")	SHOWHIDDENONLY="y"; DATATAG="y"; QUIET="y";;
		"f")	FORMACTION="y"; QUIET=y;;
		"F")	SHOWHIDDENONLY="y"; QUIET="y";;
		"g")	SHOWHIDDENONLY="y"; FORWGET="y"; QUIET="y";;
		"h")	HELP="y";;
		"i")	FORMID="$OPTARG";;
		"l")	CHANGELOG="y";;
		"j")	SHOWJAVASCRIPTTAGS="y";;
		"n")	NO_HEADER=" -n";;
		"q")	QUIET="y"; AUTOTAG="y";;
		"u")	DEBUG="y";;
		"w")	COLUMNS=30000;; #suppress line-breaking
		"x")	COOKIES="$COOKIES --no-check-certificate";;
		"?")	echo "Unknown option $OPTARG">&2; exit 1;;
		":")	echo "No argument value for option $OPTARG">&2; exit 1;;
		*)	# Should not occur
			echo "Unknown error while processing options">&2; exit 1;;
	esac
done
# Drop the parsed options so that $1 is the file or web address
shift $((OPTIND-1))
# With no file/web address and no -h/-l there is nothing to do: show help
[[ -z $1$HELP$CHANGELOG ]] && HELP="y"
# Banner (name, version, underline) unless a quiet mode or -n was requested
[[ -z $QUIET$NO_HEADER ]] && { echo -e "\n$THIS v$VERSION by Dominic (try -h for help)\n${THIS//?/=}\n"; }
if [[ $HELP = y ]]; then
	# A trailing backslash joins source lines into one paragraph, which fold
	# then re-wraps at word boundaries to the terminal width
	echo -e "Linux/Cygwin program to extract form tags from html. \
Handy for hacking website form responses; for instance, can be called from \
another script with -a option to get 'magic' hidden form responses for \
resubmission using curl or wget.

Usage       :  $THIS [options] file_or_web_address

Options     :  -a  - like -d / -F but determine automatically which is \
appropriate (-d or -F) (only for curl not wget i.e. not with -g)
               -c \"pathfilename\" - use and save any cookies in \"pathfilename\"
               -d  - return only hidden and submit tags ready for curl \
including -d for content type application/x-www-form-urlencoded
               -f  - return only the action (i.e. page to be replied to)
               -F  - return only hidden and submit tags ready for curl \
including -F for content type multipart/form-data
               -g  - return only hidden and submit tags ready for wget, \
including --post-data=
               -h  - show this help and exit
               -i \"text\" - provide unique identifying text within the form tag \
to identify the correct form on a page with multiple forms
               -j  - also show javascript tags
               -l  - show changelog and exit
               -n  - no header
               -q  - return only non-hidden tags in -a format
               -u  - debug mode (implementation may vary)
               -w  - do not wrap help or changelog text to the terminal width
               -x  - skip remote identity verification (--no-check-certificate)

Example
    # ... shell (e.g. bash) code snippet example:
    # download a remote page containing a single login form, keep/re-use any cookies
    curl -b /tmp/cookies -c /tmp/cookies -o /tmp/page.htm https://myloginpage.com
    # use $THIS to extract all hidden tags on the form
    mapfile -t HIDDENTAGS < <(./$THIS -a /tmp/page.htm)
    # add your own specific data
    LOGINTAGS=\"-F username=houdini -F password=LetMeIn\"
    # login using any cookies, hidden tags and your specific data
    curl -b /tmp/cookies -c /tmp/cookies \${HIDDENTAGS[@]} \$LOGINTAGS -o /tmp/page.htm https://myloginpage.com
    # logged in page saved at /tmp/page.htm, enjoy...

Dependencies:  awk (must be GNU awk: FPAT and gensub are used) sed wget \
(wget is only required if retrieving from internet)

License: Copyright © 2026 Dominic Raferd. Licensed under the Apache License, \
Version 2.0 (the \"License\"); you may not use this file except in compliance \
with the License. You may obtain a copy of the License at \
https://www.apache.org/licenses/LICENSE-2.0. Unless required by applicable \
law or agreed to in writing, software distributed under the License is \
distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY \
KIND, either express or implied. See the License for the specific language \
governing permissions and limitations under the License.
"|fold -sw "$COLUMNS"
fi
if [[ -n $CHANGELOG ]]; then
	# A heading is only needed when the changelog follows the help text
	[[ -n $HELP ]] && echo "Changelog:"
	echo -e "\
3.0 [17 Oct 2024]: change -q option to return non-hidden tags in -a format
2.9 [10 Oct 2024]: add -n option, make shellcheck-compatible
2.8 [26 May 2020]: add -f (action) option
2.7 [27 Mar 2018]: bugfix when there are spaces in values
2.6 [05 Jun 2017]: bugfix when there are spaces and quotes in values
2.5 [18 May 2017]: add -u (debug) option
2.4 [18 May 2016]: bugfix -i option
2.3 [15 Apr 2016]: use existing cookie file as well as saving there
2.2 [06 Nov 2015]: bugfix hidden/submit tag extraction
2.1 [28 Oct 2015]: more sophisticated hidden/submit tag extraction, better \
tidying up
2.0 [24 Aug 2015]: bugfixes
1.9 [08 Oct 2014]: bugfix for -a option, add example to help
1.8 [06 Oct 2014]: add submit to hidden tag responses
1.7 [02 Apr 2014]: add -a and -g options
1.6 [13 Mar 2014]: bugfix
1.5 [03 Feb 2014]: bugfix
1.4 [23 Jan 2014]: work with forms which don't use quotes
1.3 [02 Nov 2013]: first public release
1.2 [03 May 2012]: operate on remote web page as well as local files
1.1 [7 Feb 2011]
"|fold -sw "$COLUMNS"
fi
# Help and changelog are informational only: stop before any page processing
[[ -n $HELP$CHANGELOG ]] && exit
# Choose the directory for temporary files. A non-empty TEMP inherited from
# the environment is used as-is; otherwise TEMP becomes the first candidate
# below that is an existing directory (the current directory is the last
# resort, and is what TEMP is left as if no candidate exists).
# see https://docs.python.org/3/library/tempfile.html - this is similar to the
# way Python's tempfile is meant to work, but the order is slightly different
if [[ -z $TEMP ]]; then
	# $PWD is quoted so that a working directory containing spaces or glob
	# characters stays a single candidate instead of being word-split
	# shellcheck disable=SC2153
	for TEMP in "$TMPDIR" "$TMP" /tmp /var/tmp /usr/tmp "$PWD"; do
		[[ -d $TEMP ]] && break
	done
fi
# Obtain the page to analyse. An argument starting with "http" is treated as
# a web address and downloaded with wget to a temporary file; anything else
# must be an existing local file.
if [[ ${1:0:4} == "http" ]]; then
	# mktemp creates the file itself under an unpredictable name, so another
	# user cannot pre-create or symlink the download target
	PAGEFILE=$(mktemp "$TEMP/$THIS-$USER-XXXXXX") || { echo "Unable to create temporary file in $TEMP, aborting">&2; exit 1; }
	[[ -n $DEBUG ]] && echo "doing: wget -q $COOKIES -O \"$PAGEFILE\" \"$1\""
	# COOKIES holds several wget arguments and must be word-split
	# shellcheck disable=SC2086
	wget -q $COOKIES -O "$PAGEFILE" "$1" || {
		WGETERROR=$?
		# do not leave the empty/partial download behind (kept in debug mode)
		[[ -z $DEBUG ]] && rm -f -- "$PAGEFILE"
		echo "Unable to retrieve $1 (error $WGETERROR), aborting">&2; exit 1
	}
else
	PAGEFILE="$1"
	[[ -f $PAGEFILE ]] || { echo "Unable to locate $PAGEFILE, aborting">&2; exit 1; }
fi

# Working files: TMP1 holds the current list of form tags (one per line) and
# TMP2 is scratch space used whenever TMP1 has to be filtered into itself.
# Both come from mktemp so that concurrent runs and other users cannot
# clobber, or pre-create, a fixed file name in /tmp.
if ! TMP1=$(mktemp) || ! TMP2=$(mktemp); then
	echo "Unable to create temporary files, aborting">&2; exit 1
fi

[[ -z $QUIET ]] && echo -e "Form tags:"
# Stage 1 (sed): strip a carriage return from each line and start a new line
#   after every ">" that is followed by more text.
# Stage 2 (awk): split on double quotes; an even field count means an odd
#   number of quotes, i.e. a quoted value continues on the next line, so the
#   line is printed without a newline and thereby joined to its successor.
# Stage 3 (sed): within each <form ... </form> range drop lines containing
#   layout tags and blank lines, remove tabs and leading whitespace, and
#   print what remains.
sed 's/\r//;s/>\(.\)/>\n\1/g' "$PAGEFILE"|
	awk -F\" '{if ((NF % 2)==0) {printf "%s",$0} else {print $0}}'|
	sed -n '/<form/,/<\/form>/{/<small\|<\/small>\|<div\|<\/div>\|<a\|<\/a\|<br\|<p\|<\/p\|<li\|<\/li\|<h[0-5]\|<\/h[0-5]\|<label\|<\/label\|<ul\|<\/ul\|<tr\|<\/tr\|<th\|<\/th\|<td\|<\/td\|<table\|<\/table\|<img\|<span\|<\/span\|<hr/d;/^\s*$/d;s/\t*//g;s/^\s*//p}'>"$TMP1"

# -i: keep only the first form whose <form ...> line matches FORMID (which
# is used as a sed regular expression), stopping at its closing tag
if [[ -n $FORMID ]]; then
	cp "$TMP1" "$TMP2"
	sed -n "/^<form.*$FORMID/,/^<\/form>/{/^<\/form>/{p;q};p}" "$TMP2">"$TMP1"
fi

if [[ -n $FORMACTION ]]; then
	# -f: print only the value of the action attribute (quoted or unquoted)
	sed -n '/^<form /,/^<\/form>/{s/.* action="\?\([^"> ]*\).*/\1/p}' "$TMP1"
else
	# Prefix for each emitted name=value pair: -d when requested with -d,
	# otherwise -F; with -a/-q it is chosen from the form encoding; -g
	# (wget) overrides all of these
	if [[ -n $DATATAG ]]; then CURLOPTION="-d "; else CURLOPTION="-F "; fi
	if [[ -n $AUTOTAG ]]; then
		# multipart/form-data on the same line as "action", in either order
		if grep -q -e "action.*multipart/form-data" -e "multipart/form-data.*action" "$TMP1"; then
			CURLOPTION="-F "
		else
			CURLOPTION="-d "
		fi
	fi
	[[ -n $FORWGET ]] && CURLOPTION="--post-data="
	# In any quiet/hidden-only mode convert the tag lines to name=value
	# arguments; otherwise the raw tag lines are shown unchanged
	[[ -n $SHOWHIDDENONLY$QUIET ]] && cp "$TMP1" "$TMP2" && awk -v CURLOPTION="$CURLOPTION" -v DEBUG="$DEBUG" -v SHOWHIDDENONLY="$SHOWHIDDENONLY" '
	# awk script begins (no apostrophes allowed in here: the script is single-quoted)
	# use FPAT so each tag and value (aka tag_assignment) is in one field even with spaces in the value
	BEGIN {FPAT = "([^ \"]+)|([^ ]*=\"[^\"]+\")"}
	{
		if (DEBUG!="") {
			print "Line "NR": "$0
			for (FIELD=1; FIELD<=NF; FIELD++) {print "  Pre-processed field "FIELD": "$FIELD}
		}
		IGNORE="y"	  # ignore this line unless we decide later we need it
		NAME=""; VALUE=""
		for (FIELD=1; FIELD<=NF; FIELD++) {
			if (DEBUG!="") print "  Field "FIELD": "$FIELD
			# split attribute=value at the first equals sign
			EQUALS=index($FIELD,"="); PARAMETER=substr($FIELD,1,EQUALS-1); ASSIGNMENT=substr($FIELD,EQUALS+1)

			# remove any start-of-tag or end of tag
			if (substr(PARAMETER,1,1)=="<") PARAMETER=substr(PARAMETER,2)
			if (substr(ASSIGNMENT,length(ASSIGNMENT))==">") ASSIGNMENT=substr(ASSIGNMENT,1,length(ASSIGNMENT)-1)
			# remove any quotes in ASSIGNMENT
			gsub("\"","",ASSIGNMENT)

			if (PARAMETER=="type") {
				# hidden-only modes need just type=hidden or submit;
				# otherwise (-q) exactly those two types are skipped.
				# break leaves IGNORE set, so the whole line is dropped
				if (SHOWHIDDENONLY=="y" && (ASSIGNMENT!="submit" && ASSIGNMENT!="hidden")) break
				if (SHOWHIDDENONLY!="y" && (ASSIGNMENT=="submit" || ASSIGNMENT=="hidden")) break
				IGNORE="n"	# this is a line that we need
			}
			if (PARAMETER=="value") VALUE=ASSIGNMENT
			if (PARAMETER=="name") NAME=ASSIGNMENT
		}
		if (IGNORE=="n" && NAME!="") {
			# putting quotes round the value breaks it, tho Im not sure why (08 Jun 2017)
			# spaces are encoded so each result stays a single shell word
			print CURLOPTION NAME "=" gensub(/ /,"%20","g",VALUE)
		}
	}
	# awk script ends
	' "$TMP2" >"$TMP1"
	cat "$TMP1"
fi

# -j: additionally list the contents of <script> ... </script> ranges, using
# the same line-splitting and joining stages as for the form tags
if [[ -n $SHOWJAVASCRIPTTAGS ]]; then
	echo -e "\nJavascript tags:"
	sed 's/\r//;s/>\(.\)/>\n\1/g' "$PAGEFILE"|
		awk -F\" '{if ((NF % 2)==0) {printf "%s",$0} else {print $0}}'|
		sed -n '/<script/,/<\/script>/{/<div\|<\/div>\|<a\|<\/a\|<br\|<p\|<\/p\|<li\|<\/li\|<ul\|<\/ul\|<img\|<span\|<\/span\|<hr/d;/^\s*$/d;p}'
fi

# Tidy up, unless debugging. PAGEFILE is removed only when it is a download
# made by this script, never when it is the local file supplied by the user.
if [[ -z $DEBUG ]]; then
	rm -- "$TMP2" "$TMP1"
	[[ ${1:0:4} == "http" ]]  && rm -- "$PAGEFILE"
else
	echo "Debug mode: retained $TMP2 $TMP1 $PAGEFILE"
fi
exit 0
