#!/bin/sh
# showlink
# 19991012 <verde (a) aurelio net> ** 1st version
#
# a shell+find+grep+sed utility for web designers.
# showlink shows all kinds of links on a web site.
# it scans all *html pages below the current dir (recursively).
# you can parse its output to find lost links, do
# statistics, find errors, or just for curiosity &:)
#
# a quick example:
# cp showlink /bin ; cd /home/httpd/html ; showlink -a
# Usage: print the help text on stdout and terminate the script with
# exit status 1. Called when no option is given, when the option is not
# recognised, and (through the default arm of the option parser) for
# --help as well.
Usage() {
  # ${0##*/} is the script's base name; unlike an unquoted `basename $0`
  # it also works when the script path contains spaces.
  # The long form of -z is listed as --zero because that is the spelling
  # the option parser accepts.
  cat <<EOF
$0 OPTION (just one option)
-a,--all all links found
-c,--chk check for incorrect links
-d,--dir links that point to directories
-e,--email links that point to emails (mailto:)
-f,--ftp links that point to external pages (ftp://)
-h,--http links that point to external pages (http://)
-i,--image links that point to images
-m,--music links that point to music files
-p,--page links on the same page
-z,--zero links marked with ??? (todo link)
--help this help
UPPERCASE single option (-[ACDEFHIMPZ]) give the output without the
filename. i.e. "${0##*/} -A"

EOF
  exit 1
}
# Exactly one option is expected; without it, show the help and quit.
[ "$1" ] || Usage
OPT=$1

### the defaults for full output - do NOT edit!

# The delimiter between the filename and the link found.
D=':'

# Options passed to grep. They depend on the case of the option given
# on the command line: lowercase is the default and shows the filename
# (-H); uppercase suppresses it.
EGREP_OPTS='-His'

# The grep pattern is only 'href=' and not '<a[^I ]\+href=' or
# something supporting multi-line anchors such as <a
# href="link.html". The quotes are deliberately left out of the
# pattern so that a link with missing quotes is still reported (the
# whole line then appears on the output).
EGREP_PATT='href='

# This sed filter reduces each grep output line to "filename$Dlink",
# where link is the string between the double quotes after href=.
# ',' is used as the s-command delimiter: it is a single-byte character
# (sed rejects multi-byte delimiters) and does not occur in the
# expression itself.
EGREP_FILTER="s,\(^[^:]*\):.*href=\"[ ]*\([^ \"]*\)[ ]*\".*,\1$D\2,"

# This sed script is the action tied to the command-line option:
# the "sedmagic". 'p' is the default and prints every link; the real
# magic is selected by the 'case' statement that follows.
SEDMAGIC='p'

# Detect an UPPERCASE option: lowercase the text after the first dash
# and compare. printf with a quoted argument is used so the option text
# is neither glob-expanded nor taken as an echo flag.
OPT2=$(printf '%s\n' "${OPT#-}" | sed 'y/ACDEFHIMPZ/acdefhimpz/')
if [ "${OPT#-}" != "$OPT2" ]; then
  # Continue with the lowercase equivalent of the option.
  OPT=-$OPT2
  # There is no filename prefix in this mode, so the "delimiter" turns
  # into the start-of-line anchor for the patterns built from $D.
  D='^'
  # No -H here; -h makes the filename suppression explicit.
  EGREP_OPTS='-his'
  # Keep only the link itself.
  EGREP_FILTER="s,^.*href=\"[ ]*\([^ \"]*\)[ ]*\".*,\1,"
fi
# sedmagics here!
# Pick the sed script that filters the extracted links according to the
# option. $D is ':' (filename shown) or '^' (filename suppressed), so
# the patterns built from it match right at the start of the link.
# ',' is the regex delimiter (\,regex,p): it is a single-byte character,
# which sed requires, and it does not occur in any of the patterns.
case "$OPT" in
  -a|--all   ) : ;;                                   # it uses the defaults
  -c|--chk   ) SEDMAGIC="\,${D}/[^/],p" ;;            # TODO: more checks
  -d|--dir   ) SEDMAGIC="\,${D}[^.#]\+\$,p" ;;
  -e|--email ) SEDMAGIC="\,${D}mailto:,p" ;;
  -f|--ftp   ) SEDMAGIC="\,${D}ftp://,p" ;;
  -h|--http  ) SEDMAGIC="\,${D}http://,p" ;;
  -i|--image ) SEDMAGIC='\,\.\(jpg\|gif\|png\|bmp\)$,p' ;;
  -m|--music ) SEDMAGIC='\,\.\(ra\|rm\|mp3\|wav\)$,p' ;;
  -p|--page  ) SEDMAGIC='\,#,p' ;;
  # --zzz is the spelling shown in the help text, --zero the one that
  # was historically accepted here; take both.
  -z|--zzz|--zero ) SEDMAGIC='\,???,p' ;;
  *) Usage ;;
esac
# And after all these preparations, the brain is here.
# find hands each *html file below the current directory to its own
# grep process, so grep always sees a single file (the filename is only
# printed when -H is among the options), and names containing spaces or
# glob characters are passed through intact.
# $EGREP_OPTS is left unquoted on purpose: it is an option string.
# grep -E replaces the obsolescent egrep command.
find . -type f -name '*html' -exec grep -E $EGREP_OPTS -e "$EGREP_PATT" {} \; |
  sed "$EGREP_FILTER" |
  sed -n "$SEDMAGIC"