#! /usr/bin/ksh
##########################################################################
# Title      :	search-swish.cgi - search using "swish-e" (http://swish-e.org)
# Version    :	%I%
# Author     :	Heiner Steven <heiner.steven@odn.de>
# Date       :	2002-03-06
# Category   :	CGI
# Requires   :	swish-search, urlgetopt
# SCCS-Id.   :	%Z% %M%	%I% %D%
##########################################################################
# Description
#    External parameters
#	QUERY	query string
#
# Note
#    Needs the following non-standard programs
#	urlgetopt, swish-e
##########################################################################

# Name this script was invoked with, directory part removed. Used as
# prefix for diagnostics and (in CGI mode) compared with SCRIPT_NAME.
PN=${0##*/}			# Program name
# SCCS keyword, see "SCCS-Id." in the file header
VER='%I%'

##########################################################################
# Configuration section
##########################################################################

# Optional site-specific settings; sourced only if the file is readable
# in the current directory.
[ -r ./setenv ] && . ./setenv

# awk implementation used by urlencode and formathtml ("nawk" unless
# already set, e.g. by ./setenv)
: ${NAWK:=nawk}

DEF_PAGESIZE=10			# number of matches per page

# "Debug" is executed as a command ("$Debug && ..."), so it must be
# either "true" or "false".
#Debug=true			# {false|true}
Debug=false			# {false|true}
# Document root of the web server. Note that ":=" also assigns the
# default to DOCUMENT_ROOT itself if that variable is unset or empty.
DocumentRoot=${DOCUMENT_ROOT:=/var/apache/htdocs}
# Index file handed to the search command with "-f"
SwishIndex=$DocumentRoot/index.swish-e
# Prefix put in front of each result URL and of the "source code" link
WEBROOT=http://www.shelldorado.com

# HTML fragments written before and after the generated page content
libbase=$DocumentRoot/search
Header=$libbase/header.html
Footer=$libbase/footer.html

##########################################################################
# End configuration section
##########################################################################

# For searching we will "eval" the following command line. Note that the
# quoting is essential to preserve whitespace within arguments: the
# single quotes keep "$SwishIndex" and "$*" unexpanded until the "eval",
# and the argument of "-d" is one literal TAB character (formathtml
# splits the result lines at TABs).

searchcmd='swish-search -d "	" -f "$SwishIndex" -w "$*"'

# Mark the names of all variables we want to accept on the command line
# as "tagged" (typeset -t). Variables not listed will not be accepted:
# the main program later lists the tagged names with "typeset -t" to
# build its table of valid CGI arguments.
# Define new variables here!

typeset -t QUERY=	\
	OLDQUERY=	\
	NEXTPAGE=	\
	NUM=		\
	START=

# We need "urlgetopt", "swish-e"; they are searched for in the
# directories appended here ($HOME/local/bin, the current directory
# and /bin).
PATH=$PATH:$HOME/local/bin:$PWD:/bin
export PATH

##########################################################################
# Usage - print a help message and terminate the script with status 1
#
# A one-line banner is always written to stderr. If $iscgi is "true" the
# list of valid CGI arguments is written to stdout as a <pre> block,
# otherwise a command line synopsis is written to stderr.
##########################################################################
function Usage {
    print -u2 "$PN - short description, $VER (hs '00)"

    # Command line invocation: synopsis on stderr, then leave
    $iscgi || {
	print -u2 "usage: $PN query [...]"
	exit 1
    }

    # CGI invocation: the help text becomes part of the generated page
    print "<pre>
Valid CGI arguments:
    QUERY	query string
    NUM		number of matches to list per page
    NEXTPAGE	{ \"|<\" | \"<<\" | \">>\" | \">|\" }
		for first, previous, next, or last page
</pre>"
    exit 1
}

# Msg - write all arguments, prefixed with "<program name>:", as one
# line to standard error
function Msg {
    print "$PN:" "$@" >&2
}

# Fatal - report the arguments via Msg, then terminate the script with
# exit status 1
function Fatal {
    Msg "$@"
    exit 1
}

##########################################################################

# isnumber - test whether the single argument is a non-empty string of
# decimal digits (no sign, no whitespace).
# Returns 0 if it is, 1 otherwise; any other argument count is a fatal
# error.
function isnumber {
    if (( $# != 1 ))
    then
	Fatal "$0: wrong number of arguments"
    fi

    case $1 in
	(""|*[!0-9]*)	return 1;;	# empty, or contains a non-digit
    esac
    return 0
}

# min - print the smallest of the given integers to stdout
function min # num1 [num2 ...]
{
    typeset smallest=$1 candidate
    shift

    for candidate in "$@"
    do
	if (( $candidate < $smallest ))
	then
	    smallest=$candidate
	fi
    done
    print -- "$smallest"
}

# max - print the largest of the given integers to stdout
function max # num1 [num2 ...]
{
    typeset largest=$1 candidate
    shift

    for candidate in "$@"
    do
	if (( $candidate > $largest ))
	then
	    largest=$candidate
	fi
    done
    print -- "$largest"
}

##########################################################################

# striphtml - copy the named files (or stdin) to stdout with everything
# between a "<" and the next ">" on the same line removed. sed works
# line by line, so a tag spanning several lines is not removed.
function striphtml {
    # NOTE(review): the first expression is a no-op as written; it was
    # presumably meant to strip a trailing carriage return - confirm.
    sed -e 's/$//' \
	-e 's/<[^>]*>//g' "$@"
}

##########################################################################

# urlencode - URL-encode the named files (or stdin), line by line
#
# Letters, digits, "." and "-" are copied unchanged, a blank becomes
# "+", and every other character is written as "%XX" (two upper-case
# hexadecimal digits). Each input line yields one output line.
# Uses the awk implementation named by $NAWK.
function urlencode
{
	$NAWK '
	    BEGIN {
		# Only basic awk features are used on purpose: no
		# printf("%02X"), and the character code is looked up in
		# a table built here instead of being computed.

		# hexdigit[0..15]: value of a nibble -> hexadecimal digit
		split ("1 2 3 4 5 6 7 8 9 A B C D E F", hexdigit, " ")
		hexdigit [0] = 0

		# code[character]: numeric code (1..255) of the character
		for ( n=1; n<=255; ++n ) code [ sprintf ("%c", n) "" ] = n + 0
	    }
	    {
		out = ""
		len = length ($0)
		for ( pos=1; pos<=len; ++pos ) {
		    ch = substr ($0, pos, 1)
		    if ( ch == " " ) {
			out = out "+"		# special handling
		    } else if ( ch ~ /[a-zA-Z0-9.-]/ ) {
			out = out ch		# safe character
		    } else {
			# unsafe character: "%" plus high and low nibble
			out = out "%" hexdigit [int (code [ch] / 16)] hexdigit [code [ch] % 16]
		    }
		}
		print out
	    }
	' "$@"
}

# htmlencode - print the arguments (joined like "$*") to stdout with the
# characters that are special in HTML replaced by character entities,
# so the result can be used as element content or inside a
# double-quoted attribute value:
#	&  ->  &amp;	<  ->  &lt;	>  ->  &gt;	"  ->  &quot;
function htmlencode
{
    # "&" must be handled first, otherwise the ampersands of the
    # entities generated by the other expressions would be escaped again.
    sed -e 's/&/\&amp;/g' \
	-e 's/</\&lt;/g' \
	-e 's/>/\&gt;/g' \
	-e 's/"/\&quot;/g' <<EOT
$*
EOT
}

##########################################################################

# mkurl - print the query string (the part after the "?") that
# reproduces the current search, for use in a link:
#	NUM=<NUM>&QUERY=<QUERY>&OLDQUERY=<OLDQUERY>
# QUERY and OLDQUERY are URL-encoded, because they are free text and
# may contain "&", "%", "+", "#" or blanks that would otherwise change
# the meaning of the link. The complete string is passed through
# htmlencode before it is printed, because the caller places it into an
# HTML attribute.
#
# Globals read: NUM, QUERY, OLDQUERY
# Returns: 0
#
# TODO: dynamically generate line using "tagged" variables
function mkurl
{
    typeset query= oldquery= url=

    # "print -r" keeps backslashes in the user input untouched
    query=$(print -r -- "$QUERY" | urlencode)
    oldquery=$(print -r -- "$OLDQUERY" | urlencode)
    url="NUM=$NUM&QUERY=$query&OLDQUERY=$oldquery"

    print -- "$(htmlencode "$url")"
    return 0
}

##########################################################################

# printnavigation - write the search form to stdout
#
# The form always contains the query input field, a hidden OLDQUERY
# field holding the current query, and the "Go!" button. If MAX is
# greater than 0 the paging buttons (first/previous/next/last) and a
# hidden START field are added.
#
# Globals read: QUERY, SCRIPT_NAME, MAX, START
# Returns: 1 if the paging buttons were left out (MAX unset or <= 0),
#	   otherwise the exit status of the last "cat"
function printnavigation {
    typeset htmlencoded=$(htmlencode "$QUERY")

    cat <<EOT
<form method="GET" action="$SCRIPT_NAME">
Search term
<input type="text" name="QUERY" value="$htmlencoded" size="30">
<input type="hidden" name="OLDQUERY" value="$htmlencoded">
<input type="submit" value="Go!" title="Start searching">
EOT

    if (( ${MAX:-0} <= 0 ))
    then
	# Nothing to page through - but the form opened above still has
	# to be closed, otherwise the rest of the page ends up inside it.
	print "</form>"
	return 1
    fi

    cat <<EOT
0 <input type="submit" name="NEXTPAGE" value="|<" title="First Page">
<input type="submit" name="NEXTPAGE" value="<<" title="Previous Page"> $START
<input type="submit" name="NEXTPAGE" value=">>" title="Next Page">
<input type="submit" name="NEXTPAGE" value=">|" title="Last Page">
$MAX
<input type="hidden" name="START" value="$START">
</form>
EOT
}

##########################################################################

# evalnavigation - compute the offset of the first match to display
#
# START is reset to 0 if the query differs from the previous one
# (OLDQUERY), and is then moved according to the paging button that was
# pressed (NEXTPAGE):
#	"|<"	first page
#	"<<"	one page back, but not before 0
#	">>"	one page forward, only if START+NUM does not exceed MAX
#	">|"	start of the last page
# Any other value of NEXTPAGE leaves START unchanged.
#
# Globals read:    QUERY, OLDQUERY, NEXTPAGE, DEF_PAGESIZE
# Globals written: START, and the defaults of NUM (DEF_PAGESIZE) and
#		   MAX (0) if they are unset or empty
function evalnavigation {
    # The right hand side of "!=" is quoted to force a literal string
    # comparison. Unquoted it would be a pattern, and an OLDQUERY like
    # "*" would match every new query.
    if [[ -n $OLDQUERY && -n $QUERY && $QUERY != "$OLDQUERY" ]]
    then
    	START=0
    fi

    : ${START:=0}
    : ${NUM:=$DEF_PAGESIZE}
    : ${MAX:=0}

    case "$NEXTPAGE" in
	("|<")
	    ((START=0))
	    ;;

	("<<")
	    if (($START > $NUM))
	    then ((START=$START-$NUM))
	    else START=0
	    fi
	    ;;

	(">>")
	    if (( $START + $NUM <= $MAX))
	    then ((START=$START+$NUM))
	    fi
	    ;;

	(">|")
	    # NUM is a divisor below; a page size of 0 (e.g. from an
	    # invalid NUM argument) must not reach the "%" operator.
	    if (($NUM > 0 && $MAX > $NUM))
	    then ((START=$MAX-$MAX%$NUM))	# Start of last page
	    else ((START=0))
	    fi
	    ;;
    esac
}

##########################################################################

# formathtml - convert search results read from stdin to an HTML table
#
# Input lines have four TAB separated fields:
#	ranking<TAB>url<TAB>title<TAB>filesize
# Lines whose first field starts with "#" and lines with a different
# number of fields are ignored. Each match becomes one table row with a
# bar graph of the ranking (ranking/10, taken as a percentage) and a
# link to $WEBROOT/url labelled with the title; the file name is
# appended if it differs from the title. A heading row is written
# before the first match.
#
# Globals read: NAWK, WEBROOT
function formathtml {
    $NAWK -v webroot="$WEBROOT" '
	# Return a one-row table showing "percent" as a horizontal bar
	function blockgraph(percent,    p, q, pstr, qstr, ret) {
	    p = int(percent % 101)
	    q = 100 - p		# both cells together are always 100%
	    pstr = "&nbsp;" p "&nbsp;"
	    qstr = "&nbsp;"

	    ret = "<table width=\"100%\" border=\"1\" cellpadding=\"0\" cellspacing=\"0\">"
	    ret = ret "<tr><td width=\"" p "%\" bgcolor=\"#0000FF\"><font color=\"#FFFFFF\">" pstr "</font></td>"
	    ret = ret "<td width=\"" q "%\">" qstr "</td>"
	    ret = ret "</tr>"
	    ret = ret "</table>"
	    return ret
	}

	BEGIN {
	    FS = "\t"
	    nmatches = 0
	    print "<table border=\"0\" cellpadding=\"1\" cellspacing=\"0\" width=\"80%\">"
	}

	$1 ~ /^#/ { next }			# ignore comments

	NF == 4 {
	    ++nmatches
	    ranking  = $1
	    url      = $2
	    title    = $3
	    filesize = $4
	    percent  = int(ranking/10)

	    # file name = last path component of the URL
	    if ( (p = match (url, /[^\/]*$/)) > 0 ) {
		filename = substr (url, p)
	    } else {
		filename = url
	    }

	    description = "<a href=\"" webroot "/" url "\">" title "</a>"
	    if ( title != filename ) {
		description = description " - " filename
	    }

	    if ( nmatches == 1 ) {
		print "<tr><th>Relevance</th> <th>Description<br></th></tr>"
	    }

	    print "<tr><td>" blockgraph(percent) "</td> <td>" description "<br></td></tr>"
	}

	END {
	    print "</table>"
	}
    '
}

##########################################################################
# Start of main program
##########################################################################

integer START=0			# number of first match to display
integer NUM=$DEF_PAGESIZE	# maximum number of matches per page

# We run as CGI program if the web server provided a request method.
iscgi=false
[[ -n $REQUEST_METHOD ]] && iscgi=true

# "standalone" (only set in CGI mode) tells whether the script was
# requested under its own name; only then it writes the HTTP header.
if $iscgi
then
    standalone=false
    calledname=${SCRIPT_NAME##*/}
    [[ $calledname == $PN ]] && standalone=true
fi

contenttype=text/html

# Argument processing.
#   CGI mode:	  decode QUERY_STRING into variables FORM_<name>, accept
#		  only the names marked with "typeset -t", copy them to
#		  the variables without prefix, and set the positional
#		  parameters to the words of QUERY.
#   Command line: process the option flags; the remaining arguments are
#		  the search terms.
if $iscgi
then
    #$Debug && contenttype=text/plain
    $standalone && print "Content-type: $contenttype\n"
    #$Debug && export
    $Debug && print "<pre>"

    # From here on error messages go to stdout, i.e. into the page
    exec 2>&1
    #set -x

    # A POST request delivers the arguments on standard input; otherwise
    # QUERY_STRING is used as it was found in the environment.
    case "$REQUEST_METHOD" in
	(POST)	read -r QUERY_STRING || exit 2;;
	(GET)	;;
	(*)	;;
    esac

    # Create variables of the form "VALID_FORM_x" with value "true"
    # for variable "x" we want to accept as CGI input.
    # They are used to simplify the checking if a variable is valid or
    # not. Note that they *must* not start with "FORM_", because
    # otherwise an attacker could provide it as an argument to this
    # script.

    for var in $(typeset -t | cut -d= -f1)
    do eval "VALID_FORM_$var=true"
    done

    # Remove all variables that look like they were provided
    # from the command line, but are not. Comment this code out
    # if you want to use these names to set default values.

    for var in $(set | grep '^FORM_' | cut -d= -f1)
    do unset "$var"
    done

    # The following lines evaluate all variable assignments, and create
    # variables with the prefix "FORM_", e.g. "name=a" would create
    # a variable FORM_name with the content "a".
    # NOTE:
    #	THIS IS A SECURITY RISK, because if there is a bug in
    #	"urlgetopt", it may execute code a user provided!

    #set -x
    setvars=$(./urlgetopt -l -p FORM_ "$QUERY_STRING")
    #print "setvars=|$setvars|"
    eval "$setvars"
    #set +x

    # Now check if only valid variables were specified as arguments, and
    # verify the content is valid.

    for varname in $(set | grep '^FORM_' | cut -d= -f1)
    do
	# Paranoia: check if this really is a valid identifier name.
	# The "eval" commands below rely on this check.
	[[ $varname == [a-zA-Z_]+([a-zA-Z0-9_]) ]] ||
		Fatal "Internal Error: invalid variable name: $varname"

	case "$varname" in
	    (FORM_START)
		isnumber "$FORM_START" || FORM_START=0
		START=$FORM_START
		;;

	    (FORM_NUM)
		isnumber "$FORM_NUM" || FORM_NUM=0
		NUM=$FORM_NUM
		;;

	    (*)
		# No special handling for this variable; just check
		# if it is a valid one

		vname=VALID_$varname
		if eval "[[ \"\${$vname:-false}\" != true ]]"
		then
		    print -u2 "invalid variable name: $varname"
		    unset "$varname"
		    Usage
		    # not reached
		fi

		# Create a variable with the same name as the CGI
		# argument, but without the "FORM_" prefix, e.g. copy the
		# value of "FORM_QUERY" to the variable "QUERY".

		# DANGER
		#set -vx
		eval "${varname#FORM_}=\"\$$varname\""
		#set +vx
		;;
	esac
    done

    # Split the query into words, one positional parameter per word.
    # File name generation is switched off for this: a user-supplied
    # "*" or "?" must reach the search engine literally instead of being
    # replaced by the names of the files in the CGI directory.
    set -f
    set -- $QUERY
    set +f
else
    while getopts :h opt
    do
	case "$opt" in
	    # your flags here
	    #f)	argument=$OPTARG;;
	    h)	Usage;;
	    ?)	Usage;;
	esac
    done
    shift $((OPTIND-1))
fi

#(( $# < 1 )) && exit 0

: ${NUM:=500}
: ${START:=0}

# NUM is used as a divisor by the paging calculations, so it has to be
# positive before "evalnavigation" is called (an invalid NUM argument
# arrives here as 0).
(($NUM <= 0)) && NUM=20

if (( $# > 0 ))
then
    # For the browsing feature (more precise: the "last page" button) to
    # work, we need the number of matches. "swish-search" returns this
    # number in the following format:
    #	# Number of hits: 25
    #
    # Our first search only is used to count the number of matches:

    nmatches=$(eval "$searchcmd -m 1" | egrep -i '^# Number of hits')
    nmatches=${nmatches##* }	# we only want the last column of the output
    nmatches=${nmatches%%[!0-9]*}	# ...and of that only the leading digits

    # No "Number of hits" line at all (e.g. search failed): no matches
    MAX=${nmatches:=0}

    evalnavigation
fi

# Page output: header, search form, page index, matches, footer.

cat "$Header"

if $iscgi
then
    printnavigation
fi

# Nothing to search for: the page ends after the form
if (( $# == 0 ))
then
    exit 0
fi

if (( NUM <= 0 ))
then
    NUM=20
fi

# We calculate offsets to the first match in "number of matches", and in
# "number of pages". All page numbers start with "0" (to simplify
# calculations), and will be displayed "1"-based.

# Print this number of pages before and after the matching page
integer surrpages=10
: ${nmatches:=0}

if (( $nmatches < 1 ))
then
    print "<p><hr>no matches found"
else
    integer npages curpage startpage lastpage pi

    npages=$((nmatches/NUM+1))
    curpage=$((START/NUM))
    startpage=$(max $((curpage-surrpages)) 0)
    lastpage=$(min $((curpage+surrpages)) $npages)

    #print "nmatches=$nmatches, npages=$npages"
    #print "<pre>0 <= start $startpage <= $curpage <= last $lastpage <= $npages</pre>"

    print "Page $((curpage+1)) of <strong>$npages</strong>: "

    myurl="$SCRIPT_NAME?$(mkurl)"

    # Page index: one entry per page from "startpage" up to (but not
    # including) "lastpage", separated by "-"; the current page is
    # highlighted, all others are links.
    pi=$startpage
    while (( pi < lastpage ))
    do
	(( offset = pi * NUM ))
	if (( pi > startpage ))
	then
	    print -- "-"
	fi
	#print "DEBUG: page $currentpage: QUERY=$*&START=$offset&NUM=$NUM<br>"
	if (( pi == curpage ))
	then
	    print "<font color=#ff0000><strong>$((curpage+1))</strong></font>"
	else
	    print "<a href=\"$myurl&START=$offset\">$((pi+1))</a>"
	fi
	(( pi = pi + 1 ))
    done
    print "<hr>"

    # The matches of the current page: as HTML table for the browser,
    # as plain lines (without the "# " comment lines) on the terminal
    if $iscgi
    then
	eval "$searchcmd -b \"$START\" -m \"$NUM\"" | formathtml
    else
	eval "$searchcmd -b \"$START\" -m \"$NUM\"" | egrep -v '^# '
    fi
fi

if $Debug
then
    print "</pre>"
fi

print "<p>View <a href=\"$WEBROOT/search/search-swish.cgi\">source code</a>" \
	" of this shell script"

cat "$Footer"

exit 0
