:
##########################################################################
# Title      :	dailynews - get daily news messages
# Author     :	Heiner Steven <heiner.steven@odn.de>
# Date       :	1999-12-03
# Requires   :	dumphtmltbl, html2iso, recode, striphtml, wget, wordwrap
# Category   :	WWW, Desktop
# SCCS-Id.   :	@(#) dailynews	1.70 18/03/27
##########################################################################
# Description
#
##########################################################################

# Name this script was invoked as, without the directory part; used as
# prefix of the usage text and of all diagnostic messages.
PN=`basename "$0"`
# Version string shown in the usage text.
VER=1.70

##########################################################################

# News sources that are consulted when none is named on the command line.
newssources='zdf heise spiegel sueddeutsche zdnet cnet scientist nature slashdot'
# Default limit of articles listed per news source (see option -n).
NEWSCNT=20

##########################################################################

# Command and flags used to get a HTML page by URL and write it to stdout.
# Both variables are unset first, so values inherited from the environment
# never take effect and the settings below always apply.
unset GETURL GETURLFLAGS
GETURL=wget
GETURLFLAGS="-q -O-"

# grep for extended regular expressions; an existing non-empty value
# (e.g. from the environment) is kept.
EGREP=${EGREP:-egrep}

# usage - write the help text to standard error and exit with status 1
usage () {
    cat >&2 <<EOF
$PN - get daily news headlines, $VER (stv '99)
usage: $PN [-n maxarticles] [newssource ...]
    -n:  limit the number of articles (per news source), default: $NEWSCNT

If no news source was specified, all will be consulted.

Valid news sources:
    $newssources
EOF
    exit 1
}

# msg - write every argument as a line of its own to standard error,
#       each one prefixed with the program name
msg () {
    for msgLine in "$@"
    do
	echo "$PN: $msgLine"
    done >&2
}

# fatal - print all arguments as diagnostic messages using msg, then
#         terminate the script with exit status 1
fatal () { msg "$@"; exit 1; }

##########################################################################
# Formatting helper functions - may need adjustments
##########################################################################

# urlget - retrieve data from the web by URL, print to standard output
#
# Runs the command in $GETURL with the flags in $GETURLFLAGS, followed by
# all arguments of this function.  Both variables are expanded unquoted, so
# their values are split into separate words (one word per flag).

urlget () {
    $GETURL $GETURLFLAGS "$@"
}

# numberlist - indent and number lines
#
# Input is taken from the named files, or from standard input if there are
# none.  The global $maxlines is the number of the line at which the list is
# cut off: that line is replaced with a "[more...]" marker, and sed quits.
# The result is passed through the external "wordwrap" program
# (presumably to wrap long lines with a hanging indent - confirm there).

numberlist () {
    cat "$@" |
	nl -s'. ' |
	sed -e "${maxlines}{s|^.*\$|	[more...]|;q;}" |
	wordwrap -o 9
}

# Convert UTF-8 encoded text to ISO 8859-1 ("latin1" is what recode is
# asked for; note that this is not ISO 8859-15).
# All arguments are passed on to recode.  The callers in this file use the
# function as a filter from standard input to standard output.
# NOTE(review): "-f" presumably lets recode carry on when a character has no
# latin1 equivalent -- confirm in the recode documentation.

convert_utf8_to_iso () {
    recode -f utf8..latin1 "$@"
}

# canonws - "canonical" white space: turn every TAB into a blank, then
#           replace each run of blanks with exactly one blank

canonws () {
    sed -e 's/	/ /g' \
	-e 's/  */ /g' "$@"
}

# trim - strip blanks and TABs from the start and from the end of each line

trim () {
    sed -e 's/^[ 	]*//' \
	-e 's/[ 	]*$//' "$@"
}

# rmemptylines - drop lines that are empty or consist of blanks and TABs only
rmemptylines () {
    $EGREP -v '^[ 	]*$' "$@"
}

# jointagline - join input lines until the numbers of "<" and ">" balance
#
# The awk program keeps running totals of the "<" and ">" characters seen.
# A line is printed together with its newline only if both totals are equal
# and the "multiline" flag is clear; the totals are then reset to zero.
# In all other cases the line is printed without a newline (so the next
# input line is appended to it), and "multiline" is set.  If the flag is
# still set at the end of the input, a final newline is written.
# Arguments are passed to awk as input file names.
#
# NOTE(review): match() returns the position of the FIRST "<" resp. ">" of
# the line, although the variables are named lastopen/lastclose -- confirm
# that this is what is wanted.
# NOTE(review): no caller of this function is visible in this file.
jointagline () {
    $NAWK '
    {
    	s = $0; gsub (/[^<]*/, "", s)
	nopen += length (s);
    	s = $0; gsub (/[^>]*/, "", s)
	nclose += length (s);

	if ( nopen && nopen == nclose ) {
	    lastopen = match ($0, "<")
	    lastclose = match ($0, ">")
	    multiline = (lastopen > lastclose)
	}

	#print nopen, nclose, $0 | "cat >&2"

	if ( nopen == nclose && !multiline ) {
	    print
	    #print "o", nopen, nclose, $0 | "cat >&2"
	    nopen = nclose = 0
	} else {
	    #print "+", nopen, nclose, $0 | "cat >&2"
	    printf "%s", $0
	    multiline = 1
	}
    }
    END {
    	if ( multiline ) print ""
    }
' "$@"
}

# preproc - clean up text: canonical white space (canonws), no leading or
#           trailing white space (trim), no empty lines (rmemptylines)
preproc () {
    canonws "$@" | trim | rmemptylines
}


# extract_rsstitles - extract <title> lines from a RSS feed
#
# Reads the feed from standard input.  For each <item> ... </item> section
# that contains a <pubDate> or <prefix:date> element whose text matches the
# date expression, the title of the item is printed (tags included; a
# CDATA wrapper is removed).
#
# The shell variable "todayexp" must be set by the caller; it holds an
# extended regular expression for the accepted dates, e.g.
#	todayexp="02 Oct 2002|2002-10-02"
# An empty value accepts every item that carries a date element.
extract_rsstitles () {
    : ${todayexp?}

    tr -d '\015' |			# Remove carriage-return
	$NAWK '
	    {   # Add line-feed before each opening tag
		gsub(/<[^\/]/, "\n&");
		print
	    }
	' |
	$NAWK -v todayexp="$todayexp" '
	    function extract_cdata(v) {
		# <![CDATA[Krankheit]]>  ->  Krankheit
		# (plain ">": a backslash before it would be a word
		# boundary operator in gawk)
		gsub(/<!\[CDATA\[/, "", v)
		gsub(/\]\]>/, "", v)
		return v
	    }
	    BEGIN {
		title_start = "<[tT][iI][tT][lL][eE][^>]*>"
		title_end = "</[tT][iI][tT][lL][eE][^>]*>"
		# The date expression may be an alternation ("a|b"): group
		# it, so the tag prefix applies to every alternative
		today = (todayexp == "") ? "" : "(" todayexp ")"
		pubdate = "<[pP][uU][bB][dD][aA][tT][eE]>.*" today
		# Example: "<dc:date>2013-05-02</dc:date>"
		nsdate = "<[a-zA-Z][a-zA-Z0-9]*:date>.*" today
	    }
	    /<[iI][tT][eE][mM][^>]*>/ { readitem = 1 }
	    readitem {
		# Read title on one or multiple lines
		if ($0 ~ (title_start ".*" title_end)) {
		    # Complete title on this line
		    title = extract_cdata($0)
		} else if ($0 ~ title_start) {
		    # Title continues: append lines up to the closing tag
		    title = extract_cdata($0)
		    do {
			if ( (getline) != 1 ) break
			title = title " " extract_cdata($0)
		    } while (title !~ title_end)
		}
		if ($0 ~ pubdate || $0 ~ nsdate) {
		    datevalid = 1
		}
	    }
	    /<\/[iI][tT][eE][mM]>/ {
		# End of item: print the title if the date was accepted
		if (title != "" && datevalid) {
		    print title
		}
		title = ""
		readitem = 0
		datevalid = 0
	    }
	'
}

###############################################################################

# outputvalid - copy standard input to standard output unchanged; the exit
#               status is 0 if at least one line was copied, and 1 if the
#               input was empty
outputvalid () {
    $NAWK '
	{ print }
	END { exit(NR > 0 ? 0 : 1) }
    '
}

###############################################################################
# searchprog - search program using search PATH
# usage: searchprog program
#
# Prints the path of the first regular, executable file named "program"
# that is found in a directory of $PATH, and returns 0.  Prints nothing and
# returns 1 if there is no such file.  Empty PATH entries (leading,
# trailing, or in the middle) stand for the current directory.
###############################################################################

# The function body is a subshell "( ... )": the changes to IFS and to the
# "noglob" option made below do not leak into the caller.
searchprog () (
    _search=$1
    [ -n "$_search" ] || exit 1

    # Split at ":" only, so directory names may contain blanks.  The ":"
    # appended here keeps field splitting from dropping a trailing empty
    # entry.
    _path=$PATH:
    IFS=:
    set -f				# no pathname expansion of entries

    for _dir in $_path
    do
	[ -n "$_dir" ] || _dir=.	# empty entry: current directory
	# "-x" alone would also accept a (searchable) directory
	[ -f "$_dir/$_search" ] && [ -x "$_dir/$_search" ] || continue
	echo "$_dir/$_search"
	exit 0
    done

    exit 1
)

###############################################################################
# MAIN PROGRAM

##############################################################################
# We need a "new" NAWK implementation with functions, "getline()",
# gsub()

# Use the first of mawk, gawk, nawk found in PATH, unless NAWK is already
# set; fall back to plain "awk".
: ${NAWK:=`searchprog mawk || searchprog gawk || searchprog nawk || echo awk`}

# Parse the command line with getopt(1).  Its output is captured in a
# variable first: in "set -- `getopt ...` || usage" the status tested would
# be that of "set" (always 0), so an error reported by getopt would go
# unnoticed.
args=`getopt hn: "$@"` || usage
set -- $args
[ $# -lt 1 ] && usage			# "getopt" printed nothing at all

while [ $# -gt 0 ]
do
    case "$1" in
    	-n)
	    # Article limit; only digits are accepted
	    NewsCnt=$2; shift
	    case "$NewsCnt" in
	    	*[!0-9]*)	fatal "invalid number: $NewsCnt";;
	    esac
	    ;;
	--)	shift; break;;
	-h)	usage;;
	-*)	usage;;
	*)	break;;			# First news source
    esac
    shift
done

# Without arguments, consult all known news sources
[ $# -lt 1 ] && set -- $newssources

# "maxlines" is the number of the first line that is NOT printed any more
# (see numberlist), i.e. the article limit plus one.  The last assignment
# is a fallback in case "expr" produced no output.
maxlines=${NewsCnt:-$NEWSCNT}
maxlines=`expr "$maxlines" + 1`
: ${maxlines:=$NEWSCNT}

isodate=`date +%Y-%m-%d`
LongDate=`date '+%d.%m.%Y'`		# not referenced in this file
echo "News $isodate"

# Check if there was at least one valid message, and set exit code
# accordingly
validmessage=false
error=0

for src
do
    case "$src" in
    zdf)
	#################################################################
	# Headlines from the "Zweites Deutsches Fernsehen" ZDF
	#################################################################

	url="https://www.zdf.de/rss/zdf/nachrichten"
	echo "
$url"
	# Pattern for the date of today as it appears in <pubDate>:
	# day of month without leading zeros, month abbreviation, year.
	# The month is taken from the "C" locale, because the month names
	# of a RSS <pubDate> are English whatever the locale of the user.
	day=`date "+%d" | sed 's/^0*//'`
	month=`LC_ALL=C date "+%b"`
	year=`date "+%Y"`
	date="$day.*$month.*$year"


	# Remember the latest <title> and <description> line; print both
	# when a <pubDate> line with the date of today follows.  The first
	# such line is skipped.
	urlget "$url" |
	    $NAWK '
	    	/<[tT][Ii][tT][Ll][eE]>/ { title = $0 }
		/<[dD][eE][sS][cC][rR][iI][pP][tT][iI][oO][nN]>/ { desc = $0 }
		/<[pP][uU][bB][dD][aA][tT][eE]>.*'"$date"'/ {
		    if ( skippedheader )
			print title, "-", desc
		    skippedheader = 1
		}
	    ' |
	    html2iso |
	    striphtml |
	    preproc |
	    numberlist |
	    outputvalid && validmessage=true
	;;

    heise)
	#################################################################
	# Current news from the "Heise" publishing company
	#################################################################

	url="http://www.heise.de/newsticker/"
	echo "
$url"
	# ISO date of today, searched for in the <time datetime="..."> tags
	date=`date "+%Y-%m-%d"`

	# A <time> tag with the date of today arms "datematch"; the lines
	# from the next <span ...> up to the closing </span> are printed.
	# The page is piped straight into awk; no copy is kept on disk.
	urlget "$url" |
	    $NAWK '
	    	/<time datetime="[^"][^"]*".*>/ {
		    if (match($0, /'"$date"'/) ) {
			datematch = 1
		    } else {
		    	datematch = 0
		    }
		}

		#datematch { print }

		datematch && /<[sS][pP][aA][nN].*/ { doprint = 1 }
		doprint { print }
		doprint && /<\/[sS][pP][aA][nN]>/ { doprint = 0; datematch = 0 }

	    ' |
	    convert_utf8_to_iso |
	    html2iso |		# convert German "Umlaute" to ISO 8859
	    striphtml |		# remove HTML tags
	    preproc | 		# remove whitespace
	    numberlist |
	    outputvalid && validmessage=true
	;;

    zdnet)
	#################################################################
	# Newsticker of ZDNet
	#################################################################

	url='http://www.zdnet.de/kategorie/news/'

	# Any two-digit day of the month is accepted; only the year is
	# taken from the current date
	day='[0-3][0-9]'
	year=`date +%Y`

	# End pattern matches German date output format, e.g.
	#	22. August 2011
	today="$day\. [A-Z][a-zä][a-z][a-z]* $year"

	echo "
$url"

	# The awk program works on text without tags: a line starting with
	# a date is taken as the end of an entry.  The last non-empty line
	# before it is used as title, the next line with a non-empty first
	# field as text.
	urlget "$url" |
	    tr -d '\015' |			# Remove carriage-return
	    convert_utf8_to_iso |
	    html2iso |
	    striphtml |
	    trim |
	    $NAWK '
	        /^'"$today"'/ {
		    title = prevline
		    while (getline > 0 && $1 == "") ;
		    text = $0
		    gsub(/\&raquo;.*weiter/, "", text)
		    print title, "-", text
		}
	    	$0 != "" { prevline = $0 }
	    ' |
	    numberlist |
	    outputvalid && validmessage=true
	;;

    cnet)
	#################################################################
	# Current news from C|Net
	#################################################################

	url="http://news.cnet.com/news/"
	echo "
$url"

	# Accept articles stamped with the date of today or of yesterday.
	# "GMT+24" names a time zone 24 hours behind UTC, i.e. it yields
	# the previous day (same idiom as in the "scientist" branch).
	date=`date "+%B %d, %Y"`
	yesterday=`TZ=GMT+24 date "+%B %d, %Y"`
	datepattern="($date|$yesterday)"
	# Match leading zeros: "01 May" -> "0*1 May"
	datepattern=`echo "$datepattern" | sed 's/ 0/ 0*/g'`

	# NOTE(review): the doubled ">" in the "</p>>", "<div>>" and
	# "</div>>" patterns below looks unintended -- confirm against the
	# page markup before changing it.
	# NOTE(review): "\n" in the replacement text of "s" is not
	# understood as a newline by every sed implementation.
	urlget "$url" |
	    tr -d '\015' |			# Remove carriage-return
	    sed -e 's|</p>>|&\n|g' \
	    	-e 's|<p[^>]*>|\n&|g' \
	    	-e 's|<a[^>]*>|\n&|g' \
	    	-e 's|<div>>|\n&|g' \
	    	-e 's|</div>>|&\n|g' |
	    $NAWK '
		$0 ~ /section="topStories"/ {
		    title = $0
		    print title
		}
		$0 ~ /datestamp.*'"$datepattern"'/ {
		    title = prev
		    print title
		}
		{ prev = $0 }
	    ' |
	    sed 's/<[^>]*>//g' |	# "striphtml", but no <br> handling
	    preproc |
	    numberlist |
	    outputvalid && validmessage=true
	;;

    scientist)
	#################################################################
	# Headlines from the "New Scientist"
	#################################################################

	url="http://feeds.newscientist.com/science-news"
	echo "
$url"
	# "todayexp" is the date expression read by extract_rsstitles:
	# the date of today or of yesterday, with the full month name.
	# "TZ=GMT+24" selects a time zone 24 hours behind UTC.
	today=`date "+%d %B %Y"`
	yesterday=`TZ=GMT+24 date "+%d %B %Y"`
	todayexp="$today|$yesterday"

	urlget "$url" |
	    extract_rsstitles |
	    striphtml |
	    convert_utf8_to_iso |
	    preproc |
	    numberlist |
	    outputvalid && validmessage=true
	;;

    nature)
	#################################################################
	# Headlines from the "Nature"
	#################################################################

	url="http://feeds.nature.com/nature/rss/current?format=xml"
	echo "
$url"
	# "todayexp" is the date expression read by extract_rsstitles:
	# the current GMT date, both as "DD Month YYYY" and as ISO date
	# "YYYY-MM-DD".  The variant that also accepted the date of
	# yesterday is disabled.
	today=`TZ=GMT date "+%d %B %Y"`
	today_iso=`TZ=GMT date "+%Y-%m-%d"`
	#yesterday=`TZ=GMT+24 date "+%d %B %Y"`
	#yesterday_iso=`TZ=GMT+24 date "+%Y-%m-%d"`
	#todayexp="($today|$yesterday|$today_iso|$yesterday_iso)"
	todayexp="($today|$today_iso)"

	urlget "$url" |
	    extract_rsstitles |
	    html2iso |
	    striphtml |
	    preproc |
	    numberlist |
	    outputvalid && validmessage=true
	;;

    nbc)
	#################################################################
	# Current news from NBC -- NOT USED
	# ("nbc" is not part of $newssources, so this branch is only
	# reached when the source is named explicitly)
	#################################################################

	url="http://www.msnbc.com/news/news_front.asp"
	echo "
$url"

	# Keep everything from the "TOP STORIES" line to the end, and of
	# that only the lines starting with a blank (the blank is removed)
	urlget "$url" |
	    striphtml |
	    sed -n '/TOP[ 	]*STORIES/,$p' |
	    sed -n 's/^ \(.*\)/\1/p' |
	    numberlist |
	    outputvalid && validmessage=true
	;;

    slashdot | /.)
	#################################################################
	# Current news from SlashDot
	# (the source may also be given as "/.")
	#################################################################

	url="http://rss.slashdot.org/Slashdot/slashdot"
	echo "
$url"
	# "todayexp" is the date expression read by extract_rsstitles:
	# the date of today as "DD Mon YYYY", requested with LANG=C.
	# NOTE(review): if this feed carries ISO dates (the "nature" branch
	# allows for both forms), the ISO form is needed here as well --
	# confirm against the live feed.
	todayexp=`LANG=C date '+%d %b %Y'`

	urlget "$url" |
	    extract_rsstitles |
	    html2iso |
	    striphtml |
	    preproc |
	    numberlist |
	    outputvalid && validmessage=true
	;;

    spiegel)
	#################################################################
	# Current news from "Spiegel Online"
	#################################################################

	url="http://www.spiegel.de/schlagzeilen/tops/index.rss"
	echo "
$url"
	# "todayexp" is the date expression read by extract_rsstitles:
	# the date of today as "DD Mon YYYY", requested with LANG=C
	todayexp=`LANG=C date '+%d %b %Y'`

	urlget "$url" |
	    extract_rsstitles |
	    striphtml |
	    convert_utf8_to_iso |
	    preproc |
	    numberlist |
	    outputvalid && validmessage=true
	;;

    sueddeutsche)
	#################################################################
	# Current news from "Sueddeutsche Zeitung"
	#################################################################

	# Former feed addresses, kept for reference:
	#url="http://rss.feedsportal.com/795/f/449002/index.rss"
	#url="http://rssfeed.sueddeutsche.de/c/795/f/449002/index.rss"
	url="http://rss.sueddeutsche.de/rss/Topthemen"
	echo "
$url"
	# "todayexp" is the date expression read by extract_rsstitles:
	# the date of today as "DD Mon YYYY", requested with LANG=C
	todayexp=`LANG=C date '+%d %b %Y'`

	urlget "$url" |
	    extract_rsstitles |
	    convert_utf8_to_iso |
	    striphtml |
	    preproc |
	    numberlist |
	    outputvalid && validmessage=true
	;;

    *)
	# Unknown news source: report it together with the list of valid
	# names and terminate the script (fatal exits with status 1).
	# Both lines are passed as one argument, so only the first one
	# gets the program name prefix.
    	fatal "invalid news source: $src
	valid: $newssources"
	;;
    esac
done

# Report failure if not a single news source produced an output line
case "$validmessage" in
    true)
	;;
    *)
	msg "ERROR: no news output line"
	error=1
	;;
esac

exit $error