:
##########################################################################
# Title      :  dailynews - get daily news messages
# Author     :  Heiner Steven
# Date       :  1999-12-03
# Requires   :  dumphtmltbl, html2iso, recode, striphtml, wget, wordwrap
# Category   :  WWW, Desktop
# SCCS-Id.   :  @(#) dailynews 1.67 10/07/11
##########################################################################
# Description
#    Prints the current headlines of a number of news web sites.
#    For each requested news source the start page is downloaded with
#    $GETURL, the headline section is cut out with awk/sed, HTML markup
#    is removed with the helper programs listed under "Requires", and
#    the result is printed as a numbered, word-wrapped list.
#
#    Environment (all optional):
#       GETURL, GETURLFLAGS  - download command and its flags
#       EGREP                - extended grep implementation
#       NAWK                 - awk implementation with gsub() and getline
##########################################################################

PN=`basename "$0"`                      # Program name
VER='1.67'

##########################################################################

newssources="zdf heise spiegel zdnet cnet scientist slashdot"
NEWSCNT=20                              # max. number of head articles

##########################################################################

# Get HTML page by URL, write to stdout
: ${GETURL:=wget}
: ${GETURLFLAGS="-q -O-"}
: ${EGREP:=egrep}

# A literal TAB character, used to build portable whitespace classes
# for sed and egrep (which do not reliably understand "\t").
TAB=`echo X | tr X '\011'`

# usage - print a short help text to standard error and exit with status 1
usage () {
    echo >&2 "$PN - get daily news headlines, $VER (stv '99)
usage: $PN [-n maxarticles] [newssource ...]
    -n:  limit the number of articles (per news source), default: $NEWSCNT

If no news source was specified, all will be consulted.
Valid news sources: $newssources"
    exit 1
}

# msg - print each argument as one line to standard error,
#       prefixed with the program name
msg () {
    for msgLine
    do
        echo "$PN: $msgLine" >&2
    done
}

# fatal - print the arguments using msg, then exit with status 1
fatal () { msg "$@"; exit 1; }

##########################################################################
# Formatting helper functions - may need adjustments
##########################################################################

# urlget - retrieve data from the web by URL, print to standard output
# The command and its flags come from $GETURL and $GETURLFLAGS, so both
# can be overridden from the environment. $GETURLFLAGS is intentionally
# unquoted: it holds several blank separated flags.
urlget () {
    $GETURL $GETURLFLAGS "$@"
}

# numberlist - indent and number lines
# Reads the named files (or standard input), numbers the lines, replaces
# line number $maxlines with a "[more...]" marker and stops there, then
# passes the result to "wordwrap".
numberlist () {
    cat "$@" |
        nl -s'. ' |                     # number lines, separator is '. '
        sed "${maxlines}{s|^.*\$| [more...]|;q;}" |   # limit max. number
        wordwrap -o 9
}

# "canonical" white space - replace TABs and runs of blanks with exactly
# one blank
canonws () {
    sed "s/$TAB/ /g;s/[ ][ ]*/ /g" "$@"
}

# trim - remove leading or trailing whitespace characters of a line
trim () {
    sed "s/^[ $TAB]*//; s/[ $TAB]*\$//" "$@"
}

# rmemptylines - drop lines that are empty or contain only whitespace
rmemptylines () {
    $EGREP -v "^[ $TAB]*\$" "$@"
}

# jointagline - join input lines until the number of "<" characters seen
# equals the number of ">" characters seen, i.e. (presumably) until no
# HTML tag is left open across a line break.
# NOTE(review): not called anywhere in the visible script.
jointagline () {
    $NAWK '
        {
            # count the "<" characters of this line
            s = $0; gsub(/[^<]*/, "", s)
            nopen += length(s)
            # count the ">" characters of this line
            s = $0; gsub(/[^>]*/, "", s)
            nclose += length(s)

            if ( nopen && nopen == nclose ) {
                # match() returns the position of the first occurrence
                lastopen = match($0, "<")
                lastclose = match($0, ">")
                multiline = (lastopen > lastclose)
            }

            if ( nopen == nclose && !multiline ) {
                print
                nopen = nclose = 0
            } else {
                # keep collecting: print without line terminator
                printf "%s", $0
                multiline = 1
            }
        }
        END { if ( multiline ) print "" }
    ' "$@"
}

# preproc - canonicalize white space, trim lines, remove empty lines
preproc () {
    canonws "$@" | trim | rmemptylines
}

# getrsstitle - get lines of an RSS feed
# Prints the lines following an opening "item" tag up to and including
# the line with the closing "item" tag.
# NOTE(review): not called anywhere in the visible script.
getrsstitle () {
    $NAWK '
        /\<[ ]*[iI][tT][eE][mM][ ]*\>/ {
            while (getline > 0) {
                print
                if ($0 ~ /<[ ]*\/[iI][tT][eE][mM][ ]*>/) break
            }
        }'
}

###############################################################################
# searchprog - search program using search PATH
# usage: searchprog program
# Prints the full path of the first executable match and returns 0;
# returns 1 if the program was not found. Empty PATH entries are
# treated as the current directory.
###############################################################################

searchprog () {
    _search=$1; shift

    for _dir in `echo "$PATH" | sed "s/^:/.:/;s/:\$/:./;s/:/ /g"`
    do
        [ -x "$_dir/$_search" ] || continue
        echo "$_dir/$_search"
        return 0
    done

    return 1
}

###############################################################################
# MAIN PROGRAM
###############################################################################

# We need a "new" NAWK implementation with functions, "getline()",
# gsub()
: ${NAWK:=`searchprog mawk || searchprog gawk || searchprog nawk || echo awk`}

set -- `getopt hn: "$@"` || usage
[ $# -lt 1 ] && usage                   # "getopt" detected an error

while [ $# -gt 0 ]
do
    case "$1" in
        -n)
            NewsCnt=$2; shift
            case "$NewsCnt" in
                ""|*[!0-9]*)    fatal "invalid number: $NewsCnt";;
            esac
            ;;
        --) shift; break;;
        -h) usage;;
        -*) usage;;
        *)  break;;                     # First file name
    esac
    shift
done

[ $# -lt 1 ] && set -- $newssources

# numberlist replaces line number $maxlines with the "[more...]" marker,
# so one more than the requested number of articles is needed.
maxlines=${NewsCnt:-$NEWSCNT}
maxlines=`expr "$maxlines" + 1`
: ${maxlines:=$NEWSCNT}

isodate=`date +%Y-%m-%d`
LongDate=`date '+%d.%m.%Y'`

echo "News $isodate"

for src
do
    case "$src" in
        zdf)
            #################################################################
            # Headlines from the "Zweites Deutsches Fernsehen" ZDF
            #################################################################

            url="http://www.heute.de/ZDFheute"
            echo " $url"
            urlget "$url" |
                $NAWK '
                    /sb-teaser1[ ]/ { expectheader = 1 }
                    /header-top/ && expectheader == 1 {
                        header = $0
                        expectheader = 2
                        next
                    }
                    /<h1[^>]*>/ && expectheader == 2 {
                        # "striphtml" is confused by ">" within
                        # attributes, e.g.
                        #    <a title="two<br/>lines">abc</a>
                        gsub(/"[^"]*>[^"]*"/, "\"\"")
                        print header " - " $0
                        header = ""
                        expectheader = 0
                    }
                ' |
                html2iso | striphtml | preproc | numberlist
            ;;

        heise)
            #################################################################
            # Current news from the "Heise" publishing company
            #################################################################

            url="http://www.heise.de/newsticker/"
            echo " $url"
            urlget "$url" |
                $NAWK '
                    {
                        # start a new line after each "div" tag
                        gsub(/\<\/*div[^>]*>/, "&\n")
                        print
                    }
                ' |
                recode utf8..latin1 |   # Convert UTF-8 characters to ISO 8859
                html2iso |              # convert German "Umlaute" to ISO 8859
                striphtml |             # remove HTML tags
                preproc |               # remove whitespace
                $NAWK '
                    # Print everything from the line starting with the
                    # current date up to the next line starting with a date
                    $1 == "'"$LongDate"'" {
                        $1 = ""
                        if ($0 ~ /./) print
                        while ( getline > 0 && $1 !~ /^[0-3][0-9]\.[0-9][0-9]\.2[0-9][0-9][0-9]/) {
                            if ( $0 ~ /^[ ]*$/ ) continue
                            print
                        }
                    }
                ' |
                numberlist
            ;;

        zdnet)
            #################################################################
            # Newsticker of ZDNet
            #################################################################

            url='http://www.zdnet.de/news/'
            start="Nachrichten des Tages"
            timespec="[ ]*[0-9][0-9]*\.[0-9][0-9]*\.[0-9][0-9]*[ ,][ ]*[0-9][0-9]*:[0-9][0-9]*[ ][uU][hH][rR]"
            echo " $url"
            urlget "$url" |
                tr -d '\015' |          # Remove carriage-return
                html2iso | striphtml | trim |
                $NAWK '
                    /'"$start"'/ && !headerseen {
                        # skip empty lines following the section header
                        while ( getline > 0 && $0 == "" )
                            ;
                        # print the paragraph, cutting off time stamps
                        do {
                            if (match($0, "'"$timespec"'")) {
                                line = substr($0, 1, RSTART - 1)
                                print line
                            } else {
                                print $0
                            }
                        } while ( getline > 0 && $0 != "" )
                        headerseen = 1
                    }
                ' |
                numberlist
            ;;

        cnet)
            #################################################################
            # Current news from C|Net
            #################################################################

            url="http://www.news.com/"
            echo " $url"
            urlget "$url" |
                tr -d '\015' |          # Remove carriage-return
                $NAWK '
                    /<ul[ ][ ]*id="editorsPicks"[^>]*>/ { newssection = 1 }
                    /<\/ul>.*editor/                    { newssection = 0 }
                    newssection == 1 {
                        if ($0 ~ /<h2[^>]*>/) {
                            # Extract everything between <h2>...</h2>
                            title = $0
                            gsub(/^.*<h2[^>]*>/, "", title)
                            gsub(/<\/h2[^>]*>.*$/, "", title)
                            print title
                        }
                    }
                ' |
                striphtml | preproc | numberlist
            ;;

        scientist)
            #################################################################
            # Headlines from the "New Scientist"
            #################################################################

            url="http://www.newscientist.com/news.ns"
            echo " $url"
            today=`date "+%d %B %Y"`
            # date as seen 24 hours behind GMT, used as "yesterday"
            yesterday=`TZ=GMT+24 date "+%d %B %Y"`
            urlget "$url" |
                sed -n '/^[ ]*LATEST[ ]ARTICLES[ ]*$/,$p' |
                $NAWK '
                    /<div[ ][^>]*class=.*pnlTxt.*>/ {
                        # print the lines of this "div", tracking nesting
                        level = 1
                        while (getline > 0 && level >= 0) {
                            print $0
                            if ($1 ~ /<\/div>/) {
                                level--
                            } else if ($1 ~ /<div[^>]*>/) {
                                level++
                            }
                        }
                    }' |
                html2iso | striphtml | preproc |
                $EGREP -v '^([A-Z \t0-9][A-Z \t0-9]*:)$' |
                $EGREP -v '([0-9][0-9]*[ ]*comment)|(- updated)' |
                $NAWK '
                    {
                        # input is handled in groups of three lines:
                        # headline, date, text
                        line[i % 3] = $0
                        if ( i % 3 == 2 ) {
                            head = line[0]
                            date = line[1]
                            text = line[2]
                            if (date ~ /[0-9][0-9]*:[0-9][0-9]*.*'"$today"'/ || date ~ /[0-9][0-9]*:[0-9][0-9]*.*'"$yesterday"'/) {
                                print head, "-", text
                            }
                        }
                        ++i
                    }' |
                numberlist
            ;;

        nbc)
            #################################################################
            # Current news from NBC -- NOT USED
            #################################################################

            url="http://www.msnbc.com/news/news_front.asp"
            echo " $url"
            urlget "$url" |
                striphtml |
                sed -n '/TOP[ ]*STORIES/,$p' |
                sed -n 's/^ \(.*\)/\1/p' |
                numberlist
            ;;

        slashdot | /.)
            #################################################################
            # Current news from SlashDot
            #################################################################

            url="http://slashdot.org/"
            echo " $url"
            today=`date "+%B %d"`
            urlget "$url" |
                $NAWK '
                    /<.*class="datitle".*'"$today"'/ {
                        gsub(/class="date.*/, ">"); print
                    }
                ' |
                striphtml | preproc | numberlist
            ;;

        spiegel)
            #################################################################
            # Current news from "Spiegel Online"
            #################################################################

            url="http://www.spiegel.de/schlagzeilen/tops/"
            echo " $url"
            # date with escaped dots, for use inside an awk regular expression
            todayexp=`date '+%d\.%m\.%Y'`
            urlget "$url" |
                sed 's/<[bB][rR][^>]*>/\n/g' |
                html2iso | striphtml |
                $NAWK '
                    BEGIN { maxlines = 3 }      # keep last three lines
                    $0 != "" {
                        n = (n + 1) % maxlines  # save last three lines
                        line = $0
                        # Remove everything starting with (including) the
                        # date part, e.g. "(24.11.2006) [forum]"
                        if (line ~ /[ ]*\([^(]*$/) {
                            sub(/[ ]\([^(]*$/, "", line)
                        }
                        lines[n] = line
                    }
                    /\('"$todayexp"'\)/ {
                        n = (n + 1) % maxlines
                        line = ""
                        for (i = 0; i < maxlines-1; i++) {
                            idx = (n + i) % maxlines
                            if (i == 1) {
                                line = line " - "
                            } else if (i == 2) {
                                line = line " "
                            } else if (i > 0) {
                                line = line " "
                            }
                            line = line lines[idx]
                        }
                        print line
                    }
                ' |
                numberlist
            ;;

        *)
            fatal "invalid news source: $src valid: $newssources"
            ;;
    esac
done

exit 0