:
##########################################################################
# Title      : dailynews - get daily news messages
# Author     : Heiner Steven
# Date       : 1999-12-03
# Requires   : dumphtmltbl, html2iso, recode, striphtml, wget, wordwrap
# Category   : WWW, Desktop
# SCCS-Id.   : @(#) dailynews 1.70 18/03/27
##########################################################################
# Description
#   For each requested news source the script downloads one page or RSS
#   feed, extracts the entries dated "today", strips the markup and prints
#   the result as a numbered, word-wrapped list on standard output.
#   The exit status is 0 if at least one source produced an output line,
#   and 1 otherwise.
##########################################################################

PN=`basename "$0"`                      # Program name
VER='1.70'

##########################################################################

newssources="zdf heise spiegel sueddeutsche zdnet cnet scientist nature slashdot"
NEWSCNT=20                              # max. number of head articles

##########################################################################

# Get HTML page by URL, write to stdout.
# Values inherited from the environment are discarded on purpose (unset)
# before the defaults are applied.
unset GETURL GETURLFLAGS
: ${GETURL:=wget}
: ${GETURLFLAGS="-q -O-"}
: ${EGREP:=egrep}

# usage - print a usage message to standard error and exit with status 1
usage () {
    echo >&2 "$PN - get daily news headlines, $VER (stv '99)
usage: $PN [-n maxarticles] [newssource ...]
    -n: limit the number of articles (per news source), default: $NEWSCNT

If no news source was specified, all will be consulted.
Valid news sources:
    $newssources"
    exit 1
}

# msg - print each argument as one line on standard error, prefixed with
# the program name
msg () {
    for msgLine
    do echo "$PN: $msgLine" >&2
    done
}

# fatal - print the arguments using msg, then exit with status 1
fatal () { msg "$@"; exit 1; }

##########################################################################
# Formatting helper functions - may need adjustments
##########################################################################

# urlget - retrieve data from the web by URL, print to standard output
# $GETURL and $GETURLFLAGS are intentionally unquoted: they are split into
# the command name and its separate option words.
urlget () {
    $GETURL $GETURLFLAGS "$@"
}

# numberlist - indent and number lines
# Reads the named files (or standard input).  Input line number $maxlines
# is replaced with a "[more...]" marker and everything after it is dropped.
# Uses the global variable "maxlines" set by the main program.
numberlist () {
    cat "$@" |
        nl -s'. ' |             # number lines, number separator is '. '
        sed "${maxlines}{s|^.*\$| [more...]|;q;}" |     # limit max. number
        wordwrap -o 9
}

# Convert UTF-8 encoding to ISO 8859-1 (latin1) encoding
convert_utf8_to_iso () {
    recode -f utf8..latin1 "$@"
}

# "canonical" white space - replace each run of blanks and tabs with
# exactly one blank
canonws () {
    sed 's/[[:blank:]][[:blank:]]*/ /g' "$@"
}

# trim - remove leading or trailing whitespace characters of a line
trim () {
    sed 's/^[[:blank:]]*//; s/[[:blank:]]*$//' "$@"
}

# rmemptylines - drop lines that are empty or consist only of blanks/tabs
rmemptylines () {
    $EGREP -v '^[[:blank:]]*$' "$@"
}

# jointagline - concatenate input lines until the number of "<" characters
# seen equals the number of ">" characters seen, then emit them as one line.
# Not called anywhere in this script.
jointagline () {
    $NAWK '
    {
        # Count the "<" and ">" characters of this line by deleting
        # everything else and measuring what is left.
        s = $0; gsub(/[^<]*/, "", s)
        nopen += length(s)
        s = $0; gsub(/[^>]*/, "", s)
        nclose += length(s)

        if ( nopen && nopen == nclose ) {
            # Balanced counts, but a ">" located before the first "<"
            # means this line closes one tag and opens another one.
            lastopen = match($0, "<")
            lastclose = match($0, ">")
            multiline = (lastopen > lastclose)
        }
        if ( nopen == nclose && !multiline ) {
            print
            nopen = nclose = 0
        } else {
            printf "%s", $0
            multiline = 1
        }
    }
    END { if ( multiline ) print "" }
    ' "$@"
}

# preproc - normalize white space, trim lines and drop empty lines
preproc () {
    canonws "$@" | trim | rmemptylines
}

# rssdateexp - print an extended regular expression matching one calendar
# day in the two date notations handled by extract_rsstitles:
#   "02 May 2013"  (month abbreviated or spelled out, as in <pubDate>)
#   "2013-05-02"   (ISO 8601, as in <dc:date>)
# usage: rssdateexp [tz]
#   tz: value of TZ used to determine the day (default: current setting),
#       e.g. "GMT", or "GMT+24" for "the day 24 hours ago"
# The month name is always generated in the "C" locale.
rssdateexp () {
    if [ $# -gt 0 ]
    then TZ=$1 LC_ALL=C date '+(%d %b[a-z]* %Y|%Y-%m-%d)'
    else LC_ALL=C date '+(%d %b[a-z]* %Y|%Y-%m-%d)'
    fi
}

# extract_rsstitles - extract lines from a RSS feed
# Reads the feed from standard input and prints the <title> line of each
# <item> that contains a <pubDate> or <...:date> element matching the
# extended regular expression in the shell variable "todayexp" (which
# must be set by the caller).
extract_rsstitles () {
    : ${todayexp?}
    # Join the whole document into one line, then start a new line in
    # front of each opening tag: afterwards each element begins a line.
    tr -d '\n' |
    $NAWK '
        {
            # Add line-feed before each opening tag
            gsub(/<[^\/]/, "\n&"); print
        }
    ' |
    $NAWK -v todayexp="$todayexp" '
        # Remove the CDATA wrapper, e.g. <![CDATA[Krankheit]]>
        function extract_cdata(v) {
            gsub(/<!\[CDATA\[/, "", v)
            gsub(/\]\]>/, "", v)
            return v
        }

        /<[iI][tT][eE][mM][^>]*>/ { readitem = 1 }

        readitem {
            # Read title on one or multiple lines
            title_end = "</[tT][iI][tT][lL][eE][^>]*>"
            if ( $0 ~ /<[tT][iI][tT][lL][eE][^>]*>.*<\/[tT][iI][tT][lL][eE][^>]*>/ ) {
                # Title in one line
                title = extract_cdata($0)
            } else if ( $0 ~ /<[tT][iI][tT][lL][eE][^>]*>/ ) {
                # Title in more than one line: append the following
                # lines until the closing tag was seen
                title = extract_cdata($0)
                do {
                    if ( getline != 1 ) break
                    title = title " " extract_cdata($0)
                } while (!match(title, title_end))
            }

            # Example: "<pubDate>Thu, 02 May 2013 10:00:00 GMT</pubDate>"
            if (match($0, "<[pP][uU][bB][dD][aA][tT][eE]>.*" todayexp)) {
                datevalid = 1
            }
            # Example: "<dc:date>2013-05-02</dc:date>"
            if (match($0, "<[a-zA-Z][a-zA-Z0-9]*:date>.*" todayexp)) {
                datevalid = 1
            }
        }

        /<\/[iI][tT][eE][mM]>/ {
            if (title != "" && datevalid) {
                print title
            }
            title = ""
            readitem = 0
            datevalid = 0
        }
    '
}

###############################################################################
# outputvalid - copy standard input to standard output.
# The exit status is 0 if at least one line was copied, 1 otherwise.
###############################################################################

outputvalid () {
    $NAWK '
        BEGIN { dataseen = 0 }
        {
            print
            dataseen = 1
        }
        END { exit(dataseen ? 0 : 1) }
    '
}

###############################################################################
# searchprog - search program using search PATH
# usage: searchprog program
# Prints the path of the first match and returns 0; returns 1 if the
# program was not found.  Empty PATH elements at the start or end of PATH
# are treated as the current directory.
###############################################################################

searchprog () {
    _search=$1; shift

    for _dir in `echo "$PATH" | sed "s/^:/.:/;s/:\$/:./;s/:/ /g"`
    do
        [ -x "$_dir/$_search" ] || continue
        echo "$_dir/$_search"
        return 0
    done

    return 1
}

###############################################################################
# MAIN PROGRAM
###############################################################################

# We need a "new" NAWK implementation with functions, "getline()",
# gsub()
: ${NAWK:=`searchprog mawk || searchprog gawk || searchprog nawk || echo awk`}

set -- `getopt hn: "$@"` || usage
[ $# -lt 1 ] && usage                   # "getopt" detected an error

while [ $# -gt 0 ]
do
    case "$1" in
        -n)
            NewsCnt=$2; shift
            case "$NewsCnt" in
                ''|*[!0-9]*)    fatal "invalid number: $NewsCnt";;
            esac
            ;;
        --) shift; break;;
        -h) usage;;
        -*) usage;;
        *)  break;;                     # First file name
    esac
    shift
done

[ $# -lt 1 ] && set -- $newssources

# numberlist replaces line number "maxlines" with a "[more...]" marker,
# therefore one line more than the requested number is let through.
maxlines=${NewsCnt:-$NEWSCNT}
maxlines=`expr "$maxlines" + 1`
: ${maxlines:=$NEWSCNT}

isodate=`date +%Y-%m-%d`
LongDate=`date '+%d.%m.%Y'`             # not referenced below

echo "News $isodate"

# Check if there were at least one valid message, and set exit code
# accordingly
validmessage=false
error=0

for src
do
    case "$src" in
        zdf)
            #################################################################
            # Headlines from the "Zweites Deutsches Fernsehen" ZDF
            #################################################################

            url="https://www.zdf.de/rss/zdf/nachrichten"
            echo " $url"

            day=`date "+%d" | sed 's/^0*//'`
            month=`LC_ALL=C date "+%b"`
            year=`date "+%Y"`
            date="$day.*$month.*$year"

            # The first <pubDate> matching today is skipped: at that point
            # "title" and "desc" still hold the feed header, not an article.
            urlget "$url" |
                $NAWK '
                    /<[tT][Ii][tT][Ll][eE]>/ { title = $0 }
                    /<[dD][eE][sS][cC][rR][iI][pP][tT][iI][oO][nN]>/ { desc = $0 }
                    /<[pP][uU][bB][dD][aA][tT][eE]>.*'"$date"'/ {
                        if ( skippedheader ) print title, "-", desc
                        skippedheader = 1
                    }
                ' |
                html2iso |
                striphtml |
                preproc |
                numberlist | outputvalid && validmessage=true
            ;;

        heise)
            #################################################################
            # Current news from the "Heise" publishing company
            #################################################################

            url="http://www.heise.de/newsticker/"
            echo " $url"

            date=`date "+%Y-%m-%d"`

            # Print each <span>...</span> range that follows a
            # <time datetime="..."> element carrying the date of today.
            urlget "$url" |
                $NAWK '
                    /<time datetime="[^"][^"]*".*>/ {
                        if ( match($0, /'"$date"'/) ) {
                            datematch = 1
                        } else {
                            datematch = 0
                        }
                    }
                    datematch && /<[sS][pP][aA][nN].*/ { doprint = 1 }
                    doprint { print }
                    doprint && /<\/[sS][pP][aA][nN]>/ { doprint = 0; datematch = 0 }
                ' |
                convert_utf8_to_iso |
                html2iso |      # convert German "Umlaute" to ISO 8859
                striphtml |     # remove HTML tags
                preproc |       # remove whitespace
                numberlist | outputvalid && validmessage=true
            ;;

        zdnet)
            #################################################################
            # Newsticker of ZDNet
            #################################################################

            url='http://www.zdnet.de/kategorie/news/'
            day='[0-3][0-9]'
            year=`date +%Y`

            # End pattern matches German date output format, e.g.
            #   22. August 2011
            # NOTE(review): the non-ASCII characters in the two patterns
            # below are compared with input already recoded to latin1;
            # confirm this script is stored in a matching encoding.
            today="$day\. [A-Z][a-zä][a-z][a-z]* $year"

            echo " $url"

            # A line starting with a date marks an article: the title is
            # the previous non-empty line, the text the next non-empty one.
            urlget "$url" |
                tr -d '\015' |  # Remove carriage-return
                convert_utf8_to_iso |
                html2iso |
                striphtml |
                trim |
                $NAWK '
                    /^'"$today"'/ {
                        title = prevline
                        while (getline > 0 && $1 == "")
                            ;
                        text = $0
                        gsub(/».*weiter/, "", text)
                        print title, "-", text
                    }
                    $0 != "" { prevline = $0 }
                ' |
                numberlist | outputvalid && validmessage=true
            ;;

        cnet)
            #################################################################
            # Current news from C|Net
            #################################################################

            url="http://news.cnet.com/news/"
            echo " $url"

            date=`LC_ALL=C date "+%B %d, %Y"`
            yesterday=`TZ=GMT+24 LC_ALL=C date "+%B %d, %Y"`
            datepattern="($date|$yesterday)"
            # Match leading zeros: "01 May" -> "0*1 May"
            datepattern=`echo "$datepattern" | sed 's/ 0/ 0*/g'`

            # Join the page into one line, then re-split it at paragraph,
            # anchor and division tags.
            urlget "$url" |
                tr -d '\n' |
                sed -e 's|</p>>|&\n|g' \
                    -e 's|<p[^>]*>|\n&|g' \
                    -e 's|<a[^>]*>|\n&|g' \
                    -e 's|<div>>|\n&|g' \
                    -e 's|</div>>|&\n|g' |
                $NAWK '
                    $0 ~ /section="topStories"/ {
                        title = $0
                        print title
                    }
                    $0 ~ /datestamp.*'"$datepattern"'/ {
                        title = prev
                        print title
                    }
                    { prev = $0 }
                ' |
                sed 's/<[^>]*>//g' |    # "striphtml", but no <br> handling
                preproc |
                numberlist | outputvalid && validmessage=true
            ;;

        scientist)
            #################################################################
            # Headlines from the "New Scientist"
            #################################################################

            url="http://feeds.newscientist.com/science-news"
            echo " $url"

            # Accept entries of today and of yesterday
            todayexp="(`rssdateexp`|`rssdateexp GMT+24`)"

            urlget "$url" |
                extract_rsstitles |
                striphtml |
                convert_utf8_to_iso |
                preproc |
                numberlist | outputvalid && validmessage=true
            ;;

        nature)
            #################################################################
            # Headlines from the "Nature"
            #################################################################

            url="http://feeds.nature.com/nature/rss/current?format=xml"
            echo " $url"

            # "Today" is determined in GMT for this feed
            todayexp=`rssdateexp GMT`

            urlget "$url" |
                extract_rsstitles |
                html2iso |
                striphtml |
                preproc |
                numberlist | outputvalid && validmessage=true
            ;;

        nbc)
            #################################################################
            # Current news from NBC -- NOT USED
            # (not part of the default list in "newssources")
            #################################################################

            url="http://www.msnbc.com/news/news_front.asp"
            echo " $url"

            urlget "$url" |
                striphtml |
                sed -n '/TOP[ ]*STORIES/,$p' |
                sed -n 's/^ \(.*\)/\1/p' |
                numberlist | outputvalid && validmessage=true
            ;;

        slashdot | /.)
            #################################################################
            # Current news from SlashDot
            #################################################################

            url="http://rss.slashdot.org/Slashdot/slashdot"
            echo " $url"

            todayexp=`rssdateexp`

            urlget "$url" |
                extract_rsstitles |
                html2iso |
                striphtml |
                preproc |
                numberlist | outputvalid && validmessage=true
            ;;

        spiegel)
            #################################################################
            # Current news from "Spiegel Online"
            #################################################################

            url="http://www.spiegel.de/schlagzeilen/tops/index.rss"
            echo " $url"

            todayexp=`rssdateexp`

            urlget "$url" |
                extract_rsstitles |
                striphtml |
                convert_utf8_to_iso |
                preproc |
                numberlist | outputvalid && validmessage=true
            ;;

        sueddeutsche)
            #################################################################
            # Current news from "Sueddeutsche Zeitung"
            #################################################################

            #url="http://rss.feedsportal.com/795/f/449002/index.rss"
            #url="http://rssfeed.sueddeutsche.de/c/795/f/449002/index.rss"
            url="http://rss.sueddeutsche.de/rss/Topthemen"
            echo " $url"

            todayexp=`rssdateexp`

            urlget "$url" |
                extract_rsstitles |
                convert_utf8_to_iso |
                striphtml |
                preproc |
                numberlist | outputvalid && validmessage=true
            ;;

        *)
            fatal "invalid news source: $src valid: $newssources"
            ;;
    esac
done

if [ "$validmessage" != "true" ]
then
    msg "ERROR: no news output line"
    error=1
fi

exit $error