:
##########################################################################
# Title : dailynews - get daily news messages
# Author : Heiner Steven
# Date : 1999-12-03
# Requires : dumphtmltbl, html2iso, recode, striphtml, wget, wordwrap
# Category : WWW, Desktop
# SCCS-Id. : @(#) dailynews 1.70 18/03/27
##########################################################################
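# Description
#	Fetches today's headlines from a number of news web sites and RSS
#	feeds and prints them as numbered lists, one list per news source.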
##########################################################################
# PN is the basename of $0; it prefixes all diagnostic messages.
PN=`basename "$0"` # Program name
VER='1.70'
##########################################################################
# News sources that are consulted when none is named on the command line
newssources="zdf heise spiegel sueddeutsche zdnet cnet scientist nature slashdot"
NEWSCNT=20 # max. number of head articles
##########################################################################
# Get HTML page by URL, write to stdout
# GETURL and GETURLFLAGS are unset first, so values inherited from the
# environment are discarded and the defaults below always take effect.
unset GETURL GETURLFLAGS
: ${GETURL:=wget}
: ${GETURLFLAGS="-q -O-"}
# EGREP, in contrast, keeps a non-empty value inherited from the environment.
: ${EGREP:=egrep}
# usage - write a usage summary to standard error and terminate the
# script with exit status 1
# Globals: PN, VER, NEWSCNT, newssources (read)
usage () {
    cat >&2 <<EOF
$PN - get daily news headlines, $VER (stv '99)
usage: $PN [-n maxarticles] [newssource ...]
-n: limit the number of articles (per news source), default: $NEWSCNT
If no news source was specified, all will be consulted.
Valid news sources:
$newssources
EOF
    exit 1
}
# msg - write every argument as a separate line to standard error, each
# prefixed with the program name ("$PN: ")
msg () {
    while [ $# -gt 0 ]
    do
	echo "$PN: $1" >&2
	shift
    done
}
# fatal - report the given message lines through msg, then terminate the
# script with exit status 1
fatal () {
    msg "$@"
    exit 1
}
##########################################################################
# Formatting helper functions - may need adjustments
##########################################################################
# urlget - fetch the given URL(s) and write the data to standard output
# Globals: GETURL (read)      - download command
#          GETURLFLAGS (read) - its options; both are deliberately left
#                               unquoted so that they are split into words
urlget () {
    set -- $GETURLFLAGS "$@"
    $GETURL "$@"
}
# numberlist - number the lines of the named files (or of standard input)
# and cut the list off at line "maxlines"
# Each line gets an "nl" line number followed by ". ".  Line number
# "maxlines" is replaced with " [more...]" and ends the output.  The
# result is passed through "wordwrap -o 9".
# Globals: maxlines (read)
numberlist () {
    cat "$@" |
	nl -s'. ' |
	sed -e "${maxlines}s|^.*\$| [more...]|" -e "${maxlines}q" |
	wordwrap -o 9
}
# convert_utf8_to_iso - convert UTF-8 encoded text using
# "recode -f utf8..latin1"; all arguments are handed on to recode.
# NOTE(review): this comment used to name ISO 8859-15 as the target, but
# the recode request says "latin1" - confirm which charset is intended.
convert_utf8_to_iso () {
recode -f utf8..latin1 "$@"
}
# canonws - "canonical" white space: replace every run of blanks (spaces
# and tabs) with exactly one space
# Arguments: optional file names, handed on to sed; reads standard input
#            if none are given
# The run is spelled "one blank followed by any number of blanks" so that
# the pattern can never match the empty string, and the POSIX [[:blank:]]
# class avoids literal tab characters that are easily lost when the file
# is edited or copied.
canonws () {
    sed 's/[[:blank:]][[:blank:]]*/ /g' "$@"
}
# trim - remove leading and trailing white space from every line
# Arguments: optional file names, handed on to sed; reads standard input
#            if none are given
# The POSIX [[:space:]] class is used instead of literal control
# characters inside the bracket expression: a raw line break there ends
# the sed command prematurely.  The class also covers a carriage return
# left over from CR-LF terminated input.
trim () {
    sed 's/^[[:space:]]*//; s/[[:space:]]*$//' "$@"
}
# rmemptylines - drop lines that are empty or consist of white space only
# Arguments: optional file names, handed on to grep; reads standard input
#            if none are given
# Globals:   EGREP (read) - extended grep command, default "egrep"
# Returns:   the grep status, i.e. non-zero if no line was left over
# The pattern uses the POSIX [[:space:]] class; a raw line break inside
# the pattern would be taken by grep as a separator between two patterns.
rmemptylines () {
    ${EGREP:-egrep} -v '^[[:space:]]*$' "$@"
}
# jointagline - join input lines until the "<" and ">" characters seen so
# far are balanced
# Arguments: optional file names, handed on to awk; reads standard input
#            if none are given
# Globals:   NAWK (read) - awk implementation to use
# A line is written with its line terminator only when the running counts
# of "<" and ">" are equal and the "multiline" flag is clear; the counts
# are then reset.  Otherwise the line is written without a terminator, so
# that the next line is appended to it, and the flag is set.
# NOTE(review): the flag is recomputed from the positions of the FIRST "<"
# and the FIRST ">" of the current line (not the last ones).
jointagline () {
    $NAWK '
	{
	    # gsub returns the number of replacements, i.e. the number
	    # of delimiters on this line; "line" is a scratch copy
	    line = $0
	    nopen  += gsub(/</, "<", line)
	    nclose += gsub(/>/, ">", line)
	    balanced = (nopen == nclose)

	    if (nopen && balanced)
		multiline = (index($0, "<") > index($0, ">"))

	    if (balanced && !multiline) {
		print
		nopen = nclose = 0
	    } else {
		printf "%s", $0
		multiline = 1
	    }
	}
	END {
	    # terminate a line that is still pending
	    if (multiline) print ""
	}
    ' "$@"
}
# preproc - clean up text lines in three steps: canonws (applied to the
# named files or standard input), then trim, then rmemptylines
preproc () {
    canonws "$@" | trim | rmemptylines
}
# extract_rsstitles - print the title of every RSS/RDF item that carries
# a matching date
#
# Input:   feed (XML) on standard input
# Output:  one line per selected item; the line still contains the
#          <title> markup, which callers remove afterwards
# Globals: todayexp (read) - extended regular expression describing the
#              wanted date(s), e.g. "27 Mar 2018" or "27 Mar|26 Mar";
#              must be set.  It is handed to awk with -v, so backslash
#              escapes in it are interpreted by awk.
#          NAWK (read)     - awk implementation to use
#
# An item is selected if a <pubDate> or <prefix:date> element inside it
# matches todayexp.
extract_rsstitles () {
    : ${todayexp?}

    # Join the whole feed into one line (dropping CR and LF), then start
    # a new line in front of every opening tag.  Afterwards a closing tag
    # is always on the same line as the text in front of it.  Note that
    # "<![CDATA[" counts as an opening tag here, so a CDATA title ends up
    # on the line after its <title> tag.
    tr -d '\r\n' |
    $NAWK '
	{
	    gsub(/<[^\/]/, "\n&")
	    print
	}
    ' |
    $NAWK -v todayexp="$todayexp" '
	# Remove the CDATA markers, keep the enclosed text
	function extract_cdata(v) {
	    gsub(/<!\[CDATA\[/, "", v)
	    gsub(/\]\]>/, "", v)
	    return v
	}
	BEGIN {
	    title_end = "</[tT][iI][tT][lL][eE][^>]*>"
	    # Parenthesize the date expression: otherwise an alternation
	    # such as "a|b" would match "b" anywhere on a line
	    dateexp = (todayexp == "") ? "" : "(" todayexp ")"
	    pubdate_re = "<[pP][uU][bB][dD][aA][tT][eE]>.*" dateexp
	    # Example: <dc:date>2013-05-02
	    nsdate_re = "<[a-zA-Z][a-zA-Z0-9]*:date>.*" dateexp
	}
	/<[iI][tT][eE][mM][^>]*>/ { readitem = 1 }
	readitem {
	    if ($0 ~ /<[tT][iI][tT][lL][eE][^>]*>.*<\/[tT][iI][tT][lL][eE][^>]*>/) {
		# Complete title on this line
		title = extract_cdata($0)
	    } else if ($0 ~ /<[tT][iI][tT][lL][eE][^>]*>/) {
		# Title continues on the following line(s): append
		# them until the closing tag was seen
		title = extract_cdata($0)
		do {
		    if ((getline) != 1) break
		    title = title " " extract_cdata($0)
		} while (!match(title, title_end))
	    }
	    if (match($0, pubdate_re)) datevalid = 1
	    if (match($0, nsdate_re)) datevalid = 1
	}
	/<\/[iI][tT][eE][mM]>/ {
	    if (title != "" && datevalid) print title
	    title = ""
	    readitem = 0
	    datevalid = 0
	}
    '
}
###############################################################################
# outputvalid - copy standard input to standard output unchanged
# Globals: NAWK (read) - awk implementation to use
# Returns: 0 if at least one line was copied, 1 if there was no input
outputvalid () {
    $NAWK '
	{ print }
	END { exit (NR > 0 ? 0 : 1) }
    '
}
###############################################################################
# searchprog - search program using search PATH
# usage: searchprog program
#
# Prints the path name of the first executable regular file "program"
# found in one of the PATH directories and returns 0; prints nothing and
# returns 1 if there is no such file or if no program name was given.
# An empty PATH element (leading, trailing, or "::") stands for the
# current directory and is reported as "./program".
###############################################################################
searchprog () {
    # The body runs in a subshell, so that the changes to IFS and to the
    # globbing option do not leak into the caller.
    (
	_search=$1
	[ -n "$_search" ] || exit 1

	# Split PATH at colons only: directories may contain blanks.
	# The appended colon keeps a trailing empty element from being
	# dropped by field splitting; "set -f" keeps directory names with
	# wildcard characters from being expanded.
	_path="${PATH}:"
	IFS=:
	set -f
	for _dir in $_path
	do
	    [ -n "$_dir" ] || _dir=.
	    # "-x" alone is also true for a searchable directory
	    if [ -f "$_dir/$_search" ] && [ -x "$_dir/$_search" ]
	    then
		echo "$_dir/$_search"
		exit 0
	    fi
	done
	exit 1
    )
}
###############################################################################
# MAIN PROGRAM
##############################################################################
# We need a "new" awk implementation with user-defined functions,
# "getline", and gsub().  Unless NAWK is already set to a non-empty
# value, the first one of mawk, gawk, nawk found in PATH is used, with
# plain "awk" as the last resort.
: ${NAWK:=`searchprog mawk || searchprog gawk || searchprog nawk || echo awk`}

# Parse the command line with the "getopts" builtin.  It reports an
# unknown option or a missing option argument itself and then returns
# "?", which ends up in the usage message below.  (The exit status of an
# external "getopt" run inside "set -- `...`" is hidden by "set", so an
# invalid option would not reliably stop the script.)
while getopts hn: _opt
do
    case "$_opt" in
	n)
	    NewsCnt=$OPTARG
	    case "$NewsCnt" in
		''|*[!0-9]*) fatal "invalid number: $NewsCnt";;
	    esac
	    ;;
	*)  usage;;		# -h, unknown option, missing argument
    esac
done
shift `expr $OPTIND - 1`

# Without arguments all known news sources are consulted
[ $# -lt 1 ] && set -- $newssources

# numberlist replaces input line number "maxlines" with "[more...]", so
# one line more than the wanted number of articles is let through.  If
# the addition fails, fall back to the plain default.
maxlines=`expr "${NewsCnt:-$NEWSCNT}" + 1`
: ${maxlines:=$NEWSCNT}
# Print the report header.  isodate is today's date as YYYY-MM-DD.
# LongDate (DD.MM.YYYY) is not referenced in the visible part of this
# script.
isodate=`date +%Y-%m-%d`
LongDate=`date '+%d.%m.%Y'`
echo "News $isodate"
# Check if there were at least one valid message, and set exit code
# accordingly
validmessage=false
error=0
# Process the news sources named by the positional parameters, one after
# the other.  Every case arm prints the URL, fetches it, reduces the data
# to headline lines, and ends in "outputvalid", whose exit status tells
# whether at least one line was produced.
# NOTE(review): the default list "newssources" contains "zdnet" and
# "cnet", but no arm for them is visible below; such names end up in the
# "*)" arm, which terminates the script.
for src
do
case "$src" in
zdf)
#################################################################
# Headlines from the "Zweites Deutsches Fernsehen" ZDF
#################################################################
url="https://www.zdf.de/rss/zdf/nachrichten"
echo "
$url"
# "date" becomes a pattern matching day (without leading zeros),
# abbreviated month name, and year, in this order.
day=`date "+%d" | sed 's/^0*//'`
month=`date "+%b"`
year=`date "+%Y"`
date="$day.*$month.*$year"
# The awk script remembers the most recent <title> and <description>
# lines.  For each <pubDate> line matching the pattern it prints both,
# except for the first matching line, which only sets "skippedheader".
urlget "$url" |
$NAWK '
/<[tT][Ii][tT][Ll][eE]>/ { title = $0 }
/<[dD][eE][sS][cC][rR][iI][pP][tT][iI][oO][nN]>/ { desc = $0 }
/<[pP][uU][bB][dD][aA][tT][eE]>.*'"$date"'/ {
if ( skippedheader )
print title, "-", desc
skippedheader = 1
}
' |
html2iso |
striphtml |
preproc |
numberlist |
outputvalid && validmessage=true
;;
heise)
#################################################################
# Current news from the "Heise" publishing company
#################################################################
# NOTE(review): this arm looks damaged and should be restored from a
# pristine copy of the script before it is relied upon:
#  - the first awk program and the "-e" expressions after it do not form
#    a sensible command,
#  - "$datepattern" is used but not assigned anywhere in view ("date" is),
#  - the word "handling" stands on a line of its own after a comment and
#    is therefore run as a command at the end of the pipeline,
#  - "tee tmp.1" leaves a copy of the page in the current directory.
url="http://www.heise.de/newsticker/"
echo "
$url"
date=`date "+%Y-%m-%d"`
urlget "$url" | tee tmp.1 |
$NAWK '
/
>|&\n|g' \
-e 's|]*>|\n&|g' \
-e 's|]*>|\n&|g' \
-e 's|>|\n&|g' \
-e 's|
>|&\n|g' |
$NAWK '
$0 ~ /section="topStories"/ {
title = $0
print title
}
$0 ~ /datestamp.*'"$datepattern"'/ {
title = prev
print title
}
{ prev = $0 }
' |
sed 's/<[^>]*>//g' | # "striphtml", but no
handling
preproc |
numberlist |
outputvalid && validmessage=true
;;
scientist)
#################################################################
# Headlines from the "New Scientist"
#################################################################
url="http://feeds.newscientist.com/science-news"
echo "
$url"
# Accept items dated today or yesterday.  The second date is obtained by
# running "date" with TZ=GMT+24.
today=`date "+%d %B %Y"`
yesterday=`TZ=GMT+24 date "+%d %B %Y"`
todayexp="$today|$yesterday"
urlget "$url" |
extract_rsstitles |
striphtml |
convert_utf8_to_iso |
preproc |
numberlist |
outputvalid && validmessage=true
;;
nature)
#################################################################
# Headlines from the "Nature"
#################################################################
url="http://feeds.nature.com/nature/rss/current?format=xml"
echo "
$url"
# Accept today's date (evaluated with TZ=GMT) in long or in ISO notation
today=`TZ=GMT date "+%d %B %Y"`
today_iso=`TZ=GMT date "+%Y-%m-%d"`
#yesterday=`TZ=GMT+24 date "+%d %B %Y"`
#yesterday_iso=`TZ=GMT+24 date "+%Y-%m-%d"`
#todayexp="($today|$yesterday|$today_iso|$yesterday_iso)"
todayexp="($today|$today_iso)"
urlget "$url" |
extract_rsstitles |
html2iso |
striphtml |
preproc |
numberlist |
outputvalid && validmessage=true
;;
nbc)
#################################################################
# Current news from NBC -- NOT USED
#################################################################
# "nbc" is not part of the default list "newssources"; this arm only
# runs if the name is given on the command line.
url="http://www.msnbc.com/news/news_front.asp"
echo "
$url"
urlget "$url" |
striphtml |
sed -n '/TOP[ ]*STORIES/,$p' |
sed -n 's/^ \(.*\)/\1/p' |
numberlist |
outputvalid && validmessage=true
;;
slashdot | /.)
#################################################################
# Current news from SlashDot
#################################################################
# The source may also be named "/." on the command line
url="http://rss.slashdot.org/Slashdot/slashdot"
echo "
$url"
# Date as "DD Mon YYYY", produced with LANG=C
todayexp=`LANG=C date '+%d %b %Y'`
urlget "$url" |
extract_rsstitles |
html2iso |
striphtml |
preproc |
numberlist |
outputvalid && validmessage=true
;;
spiegel)
#################################################################
# Current news from "Spiegel Online"
#################################################################
url="http://www.spiegel.de/schlagzeilen/tops/index.rss"
echo "
$url"
todayexp=`LANG=C date '+%d %b %Y'`
urlget "$url" |
extract_rsstitles |
striphtml |
convert_utf8_to_iso |
preproc |
numberlist |
outputvalid && validmessage=true
;;
sueddeutsche)
#################################################################
# Current news from "Sueddeutsche Zeitung"
#################################################################
#url="http://rss.feedsportal.com/795/f/449002/index.rss"
#url="http://rssfeed.sueddeutsche.de/c/795/f/449002/index.rss"
url="http://rss.sueddeutsche.de/rss/Topthemen"
echo "
$url"
todayexp=`LANG=C date '+%d %b %Y'`
urlget "$url" |
extract_rsstitles |
convert_utf8_to_iso |
striphtml |
preproc |
numberlist |
outputvalid && validmessage=true
;;
*)
# Unknown name: "fatal" prints the message and exits with status 1, so
# the remaining news sources are not processed.
fatal "invalid news source: $src
valid: $newssources"
;;
esac
done
# The exit status is 1 if no news source produced any output line,
# 0 otherwise.
if [ "$validmessage" != "true" ]
then
msg "ERROR: no news output line"
error=1
fi
exit $error