:
##########################################################################
# Title      :	striphtml - remove HTML tags from input
# Author     :	Heiner Steven <heiner.steven@odn.de>
# Date       :	1999-03-09
# Requires   :	
# Category   :  HTML, File Conversion
# SCCS-Id.   :	@(#) striphtml	1.5 04/07/05
##########################################################################
# Description
#	Removes all HTML tags from the input, handling multi-line
#	tags as well
##########################################################################

# Name this script was invoked under (directory part removed) and its
# version number; both are used in the usage and diagnostic messages.
PN=$(basename "$0")
VER=1.5

# Usage - write a short description and the call syntax to standard
# error, then terminate the script with exit status 1.
# Reads the global variables PN (program name) and VER (version).
Usage () {
    {
	echo "$PN - strip HTML tags from input, $VER"
	echo "usage: $PN [file ...]"
    } >&2
    exit 1
}

# Msg - write each argument as a separate line to standard error,
# prefixed with the program name (global variable PN) and ": ".
Msg () {
    for MsgLine in "$@"
    do
	echo "$PN: $MsgLine"
    done >&2
}

# Fatal - report all arguments using Msg, then terminate the script
# with exit status 1.
Fatal () {
    Msg "$@"
    exit 1
}

# Parse the command line options with the "getopts" builtin.
#
# "getopts" works on the positional parameters directly, so arguments
# are never re-split or subject to file name expansion. File names
# containing blanks or wildcard characters therefore stay intact,
# which is not the case with the construct
#	set -- `getopt h "$@"`
#
# Both "-h" and any invalid option ("getopts" prints a diagnostic for
# the latter) end the script via Usage. Option processing stops at
# "--" or at the first argument that is not an option; afterwards the
# positional parameters contain only the file names.
while getopts h Opt
do
    case "$Opt" in
					# your flags here
	h)	Usage;;
	*)	Usage;;			# "getopts" detected an error
    esac
done
shift `expr $OPTIND - 1`		# Remove the options (and "--")

# Transform the input (the named files, or standard input if there
# are none) the following way:
#    1. Replace "<BR>" tags ("<br>", "<br/>", "<br />") with a newline
#	(special handling)
#    2. Remove all tags between '<' and '>'. If a line still has
#	a '<' character, it is a multi-line tag. Join this line
#	with the next ('N'), and repeat steps 1 and 2 (this code
#	was borrowed from a script by changyj@rtfiber.com.tw).
#	Step 1 is part of the loop, because lines appended by 'N'
#	never pass through the start of the script on their own.
#	'N' is not executed on the last input line ("$!"): without
#	a next line POSIX "sed" would quit without printing the
#	pattern space, and the remaining steps would be skipped.
#    3. Replace &nbsp; character entities with a blank (special handling)
#    4. Remove character references, i.e. "&copy;", "&eacute;",
#	"&#160;" or "&#xA0;": an '&', an optional '#', letters or
#	digits, and a ';'. Other text containing '&' and ';'
#	(e.g. "a & b; c") is left alone.

sed -e ':loop
	s:<[bB][rR] */*>:\
:g
	s/<[^>]*>//g
	/</{
	    $!{
		N
		b loop
	    }
	}' \
    -e 's:&[nN][bB][sS][pP];: :g' \
    -e 's:&#*[a-zA-Z0-9][a-zA-Z0-9]*;::g' \
    "$@"
