#!/bin/sh
#
# $Header: /CVSROOT/tinohtmlparse/tinohtmlabsurl.sh,v 1.6 2009-07-12 22:19:44 tino Exp $
#
# THIS IS NOT PERFECT,
# it does not process a lot of URI attributes for now,
# like applet_codebase, applet_code etc.
#
# This Works is placed under the terms of the Copyright Less License,
# see file COPYRIGHT.CLL. USE AT OWN RISK, ABSOLUTELY NO WARRANTY.
#
# $Log: tinohtmlabsurl.sh,v $
# Revision 1.6 2009-07-12 22:19:44 tino
# Url-Parser fixed in tinohtmlabsurl.sh
#
# Revision 1.5 2007-12-30 17:57:03 tino
# Placed under the CLL, also one entity code was fixed (∧)
#
# Revision 1.4 2007-02-12 07:00:02 tino
# Commit for dist, see ChangeLog
#
# Revision 1.3 2006/06/16 20:35:41 tino
# Jump now name anchor, which can be understood better.
# Capability to send all non-URL-lines elsewhere, if you concentrate on URLs
#
# Revision 1.2 2005/02/06 00:17:06 tino
# Only full lines are fed to the parser to make output more easy to parse.
#
# Revision 1.1 2005/02/05 23:07:28 tino
# first commit, tinohtmlparse.c is missing "text" aggregation
# Exactly one or two arguments are accepted: BASEURL and, optionally, the
# file which receives all non-URL lines.  Anything else prints the usage
# text to stderr and terminates with status 1.
# Two separate tests joined by && replace the obsolescent "-a" operator;
# ${0##*/} yields the script name without spawning basename.
if [ "$#" -ne 1 ] && [ "$#" -ne 2 ]
then
  printf '%s\n' \
    "Usage: tinohtmlparse | ${0##*/} BASEURL [/dev/stdout]" \
    "Second argument gives file for non-URL lines" >&2
  exit 1
fi
awk -v BASE="$1" -v NOURLOUT="${2:-/dev/stdout}" '
# Drop the first whitespace-delimited word of x together with the single
# whitespace character following it, and return what is left.
# If x contains no whitespace at all, it is returned unchanged.
function shift(x)
{
  sub(/^[^[:space:]]*[[:space:]]/, "", x)
  return x
}
# http://user:pass@hostname/path/to/index.html?var=data#text
# -type-!!-user---!!-host-!!-path--!!-file---!!-query-!!-anchor-
#
# relative URLs: path does not start with /
# Special characters:
# # starts the anchor anywhere in the URL
# ? starts the query part anywhere in the URL
# @ must precede host, which must precede /
#
# It does not assume directory structure:
# .. becomes path "" name ".."
# ../.. becomes path "../" name ".."
# Split URI u into its components and store them in the global array
# parsed[], which is cleared first.  The components are contiguous pieces
# of u, so concatenating
#   type user host path file query anchor
# gives back the input.  Keys written:
#   type    scheme including its separator, like "http://" or "mailto:",
#           or "//" for a network-path reference like //host/path,
#           or "" if u carries neither
#   user    "user:pass@" including the trailing "@", or ""
#   host    text between type/user and the next "/" (only filled when
#           type is not empty)
#   path    directory part up to and including the last "/", or ""
#   file    whatever follows the last "/"
#   query   "?..." including the "?", or ""
#   anchor  "#..." including the "#", or ""
# b, c and i are local scratch variables, callers pass only u.
# Note that match() is used, so RSTART and RLENGTH are overwritten.
function parseuri(u, b, c, i)
{
  delete parsed

  # Fetch #anchor: everything from the first "#" on
  i = index(u, "#")
  if (i)
  {
    parsed["anchor"] = substr(u, i)
    u = substr(u, 1, i-1)
  }
  else
    parsed["anchor"] = ""

  # Fetch ?query: everything from the first "?" on
  i = index(u, "?")
  if (i)
  {
    parsed["query"] = substr(u, i)
    u = substr(u, 1, i-1)
  }
  else
    parsed["query"] = ""

  # Fetch schema:// (or schema: for things like mailto:).
  # A scheme starts with a letter, followed by letters, digits, "+", "."
  # or "-".  Both letter cases are listed explicitly, so this does not
  # depend on IGNORECASE being set already.
  b = ""
  if (match(u, /^[A-Za-z][A-Za-z0-9+.-]*:\/?\/?/))
  {
    b = substr(u, RSTART, RLENGTH)
    u = substr(u, RSTART+RLENGTH)
  }
  else if (substr(u, 1, 2) == "//")
  {
    # Network-path reference: no scheme, but a host follows.
    # Keep "//" as type, such that the pieces still add up to the input.
    b = "//"
    u = substr(u, 3)
  }
  parsed["type"] = b

  # Fetch host if there is a schema: it extends up to the first "/"
  c = ""
  if (b != "")
  {
    i = index(u, "/")
    if (i)
    {
      b = substr(u, 1, i-1)
      u = substr(u, i)
    }
    else
    {
      b = u
      u = ""
    }
    # check user:password@ (the "@" stays part of the user component)
    i = index(b, "@")
    if (i)
    {
      c = substr(b, 1, i)
      b = substr(b, i+1)
    }
  }
  parsed["user"] = c
  parsed["host"] = b

  # Now left is path/file, split behind the last "/"
  if (match(u, /^.*\//))
  {
    parsed["path"] = substr(u, 1, RLENGTH)
    parsed["file"] = substr(u, RLENGTH+1)
  }
  else
  {
    parsed["path"] = ""
    parsed["file"] = u
  }
}
# Write one debug line to NOURLOUT which shows all components currently
# held in parsed[], in the form
#   # t type="..." user="..." host="..." path="..." file="..." query="..." anchor="..."
# t is a free-form label for the line.  a and s are locals.
function dump(t,a,s)
{
  s = sprintf("# %s type=\"%s\" user=\"%s\" host=\"%s\" path=\"%s\" file=\"%s\" query=\"%s\" anchor=\"%s\"",
              t,
              parsed["type"], parsed["user"], parsed["host"],
              parsed["path"], parsed["file"],
              parsed["query"], parsed["anchor"])
  print s > NOURLOUT
}
# Parse b and remember its components in the globals basetype, basehost,
# basepath, basefile and basequery, which makefull() uses to complete
# relative URIs.  Also writes a dump() line labelled "base".
function setbase(b)
{
  parseuri(b)
  # A usable base path has to start and to end with "/".
  # Everything else, including the empty path, is replaced by the root.
  if (!(parsed["path"] ~ /^\// && parsed["path"] ~ /\/$/))
    parsed["path"] = "/"
  dump("base")
  basequery = parsed["query"]
  basefile = parsed["file"]
  basepath = parsed["path"]
  basehost = parsed["host"]
  basetype = parsed["type"]
}
# Make a full URI from a (possibly relative) one.
# The idea is: u is split by parseuri(), then each leading component which
# u does not carry itself is taken over from the current base (basetype,
# basehost, basepath, basefile, basequery as stored by setbase()), and the
# components are glued together again.
# Returns the resulting URI string.
# Side effects: parsed[] is overwritten, two dump() lines are written.
# p is a local flag, callers pass only u.
function makefull(u,p)
{
  parseuri(u)
  dump("1");
  # p is true if u has no scheme of its own.  Only such references are
  # completed with the base path below.  Something like http://example.com
  # or mailto:user@example.com must not get the base path appended.
  p = (parsed["type"]=="");
  # Bugfix:
  # We have two options here:
  # base: http://example.com/dir/file?query#anchor
  # uri=: ?query
  # Does the result have a file?
  # Browsers think "yes"
  # This now shall parse #anchor type destinations, too.
  if (p && parsed["host"]=="" && parsed["path"]=="" && parsed["file"]=="")
  {
    # Only an anchor was given: keep the query of the base as well
    if (parsed["query"]=="")
      parsed["query"]=basequery;
    parsed["file"]=basefile;
  }
  # Take over type, host and path from BASE URI
  if (p)
    parsed["type"]=basetype;
  # Well, what to do when switching from http: to https:?
  # news:whatever has no host!
  if (parsed["host"]=="" && parsed["type"]==basetype)
    parsed["host"]=basehost;
  # A relative path is resolved against the base path,
  # but only for references which came without a scheme.
  if (p && parsed["path"]!~/^\//)
    parsed["path"] = basepath parsed["path"]
  dump("2");
  return parsed["type"] parsed["user"] parsed["host"] parsed["path"] parsed["file"] parsed["query"] parsed["anchor"];
}
# Set up the table of URI carrying attributes and the initial base URI.
# tag[element,attribute] is 1 for the attribute which sets a new base URI
# and 2 for attributes whose value shall be made absolute.
# Keys are lowercase, lookups are done with tolower().
BEGIN {
  tag["base","href"]=1
  # I am conservative
  # Do not touch background,href,src I do not understand.
  tag["a","href"]=2
  tag["area","href"]=2
  tag["bgsound","src"]=2
  tag["body","background"]=2
  tag["embed","src"]=2
  tag["form","action"]=2
  tag["frame","src"]=2
  tag["iframe","src"]=2
  tag["input","src"]=2
  tag["img","src"]=2
  tag["layer","background"]=2
  tag["layer","src"]=2
  tag["link","href"]=2
  tag["script","src"]=2
  tag["table","background"]=2
  tag["td","background"]=2
  tag["th","background"]=2
  # Switch on case-insensitive matching (gawk) before the first URI is
  # parsed, such that BASE is handled the same way as all later URIs.
  IGNORECASE=1
  setbase(BASE)
}
# Attribute lines of a known element/attribute pair carry a URI behind the
# fourth field.  t is the table value: 1 sets a new base, 2 is a URI which
# shall be made absolute.
$1=="attr" && (t=tag[tolower($2),tolower($3)])!=0 {
  uri = shift(shift(shift(shift($0))))
  if (t != 1)
  {
    n = makefull(uri)
    if (uri == n)
    {
      # Nothing changed: pass the line through to stdout as it is
      print
      next
    }
    # Keep the original line as a comment, then print the rewritten one
    print "# " $0 > NOURLOUT
    print $1 " " $2 " " $3 " " $4 " " n
    next
  }
  # A new base: remember it, the line itself falls through to the rule below
  setbase(uri)
}
# Everything not handled above goes to the file for non-URL lines
{ print > NOURLOUT }
'