#!/bin/sh
# showlink
# 19991012 <verde (a) aurelio net> ** 1st version
#
# a bash+find+egrep+sed utility for web designers.
# showlink shows all kinds of links on a web site.
# it scans all *html pages below the current dir.
# you can parse its output to find lost links, do
# statistics, find errors, or just for curiosity &:)
#
# a quick example:
# cp showlink /bin ; cd /home/httpd/html ; showlink -a

# Usage: print the option summary on stdout and leave the script
# with exit status 1. it never returns to the caller.
# the script name is taken from $0; its basename is computed with a
# parameter expansion so that paths containing blanks are safe.
Usage(){
  cat <<EOF
$0 OPTION   (just one option)

    -a,--all      all links found
    -c,--chk      check for incorrect links
    -d,--dir      links that point to directories
    -e,--email    links that point to emails (mailto:)
    -f,--ftp      links that point to external pages (ftp://)
    -h,--http     links that point to external pages (http://)
    -i,--image    links that point to images
    -m,--music    links that point to music files
    -p,--page     links on the same page
    -z,--zzz      links marked with ??? (todo link)
       --help     this help

UPPERCASE single option (-[ACDEFHIMPZ]) give the output without the
filename. i.e. "${0##*/} -A"

EOF
  exit 1
}

# exactly one option is expected: with a missing or empty first
# argument show the help (which quits), else keep it as the option
if [ -z "$1" ]
then Usage
fi
OPT="$1"


### the defaults for full output - do NOT edit!

# the delimiter between the filename and the link found
D=':'

# options passed to the grep command. they depend on the case
# of the option given on the command line. lowercase is the
# default: show the filename (H option). uppercase suppresses it.
EGREP_OPTS='-His'

# the grep pattern is only 'href=' and not something stricter
# like '<a[ \t]\+href=', so that a link split over two lines, as in
#   <a
#   href="link.html"
# is still found. the "" are left out of the pattern on purpose:
# when they are missing on the html source the filter below does
# not match and the whole line appears on the output.
EGREP_PATT='href='

# this filter parses the grep output ("filename:line") and keeps
# only the filename, the delimiter and the link itself: the string
# between the "" (blanks just inside the quotes are dropped).
# - href is matched in any letter case, like grep -i does, so
#   that HREF="..." lines get their link extracted too.
# - '%' is the s delimiter: plain ASCII, because GNU sed rejects
#   a multibyte delimiter character.
# - the ^ anchor stays outside the \( \) group for portability.
EGREP_FILTER="s%^\([^:]*\):.*[Hh][Rr][Ee][Ff]=\"[[:blank:]]*\([^ \"]*\)[[:blank:]]*\".*%\1$D\2%"

# this filter is the action related to the option passed on
# the command line. here is the "sedmagic". 'p' is the default,
# and shows all links. the true magics are set by the option 'case'.
SEDMAGIC='p'



# let's see if we have an UPPER option and set the filters to it.
# the option (minus its first dash) is folded to lowercase; when
# that changed something, the lowercase option is the one to run
# and the output goes without the filename.
# the expansion is quoted and printed with printf so the option
# text is never globbed, split or taken as an echo flag.
OPT2=$(printf '%s\n' "${OPT#-}" | sed 'y/ACDEFHIMPZ/acdefhimpz/')
if [ "${OPT#-}" != "$OPT2" ]
then OPT="-$OPT2"
     # no "filename:" prefix now, so D turns into the regex
     # anchor that ties the sedmagics to the start of the line
     D='^'
     # same grep options, minus the H that prints the filename
     EGREP_OPTS='-is'
     # keep just the link. href is matched in any letter case
     # (as grep -i does) and '%' is a plain ASCII s delimiter,
     # because GNU sed rejects a multibyte delimiter character.
     EGREP_FILTER="s%^.*[Hh][Rr][Ee][Ff]=\"[[:blank:]]*\([^ \"]*\)[[:blank:]]*\".*%\1%"
fi


# sedmagics here! every option selects the sed address that prints
# only the wanted links. ${D} ties a pattern to the start of the
# link: it is ':' after the filename, or '^' when the filename is
# not shown. '%' delimits the address (\%regex%): plain ASCII,
# because GNU sed rejects a multibyte delimiter character.
# notes:
# - \{1,\} is the POSIX spelling of "one or more".
# - \| (used by -i and -m) is a GNU sed extension.
# - ??? are three literal question marks on a basic regex.
# - --zzz is the name shown on the help; --zero is still accepted.
# - anything else, --help included, ends on Usage.
case "$OPT" in
  -a|--all       ) :                                        ;; # it uses the defaults
  -c|--chk       ) SEDMAGIC="\%${D}/[^/]%p"                 ;; # TODO: more checks
  -d|--dir       ) SEDMAGIC="\%${D}[^.#]\{1,\}\$%p"         ;;
  -e|--email     ) SEDMAGIC="\%${D}mailto:%p"               ;;
  -f|--ftp       ) SEDMAGIC="\%${D}ftp://%p"                ;;
  -h|--http      ) SEDMAGIC="\%${D}http://%p"               ;;
  -i|--image     ) SEDMAGIC='\%\.\(jpg\|gif\|png\|bmp\)$%p' ;;
  -m|--music     ) SEDMAGIC='\%\.\(ra\|rm\|mp3\|wav\)$%p'   ;;
  -p|--page      ) SEDMAGIC='\%#%p'                         ;;
  -z|--zzz|--zero) SEDMAGIC='\%???%p'                       ;;
                *) Usage                                    ;;
esac

# and after all this preparations, the brain is here:
# find every regular file named *html below the current dir, grep
# the link lines of each one, reduce them to the link itself and
# print the ones selected by the sedmagic.
# find hands each filename straight to grep, so names with blanks
# or glob characters work. grep still runs once per file (\;), so
# without the H option no filename is printed, as before.
# $EGREP_OPTS is left unquoted on purpose: it holds grep options.
find . -type f -name '*html' \
     -exec grep -E $EGREP_OPTS -e "$EGREP_PATT" -- {} \; |
   sed "$EGREP_FILTER" |
   sed -n "$SEDMAGIC"