# $Id: wwwurl.pl,v 0.15 1994/09/21 01:23:18 fielding Exp $ # --------------------------------------------------------------------------- # wwwurl: A package for parsing and manipulating World-Wide Web # Uniform Resource Locators (URL). # # This package has been developed by Roy Fielding # as part of the Arcadia project at the University of California, Irvine. # It is distributed under the Artistic License (included with your Perl # distribution files). # # 13 Jun 1994 (RTF): Initial version # 17 Jun 1994 (RTF): Fixed double-relative URL handling (e.g. ../../) # 06 Jul 1994 (RTF): Fixed bug parsing URLs with an empty path and added # fallback code for undefined associative array entries. # Replaced complicated unescape loop with a simple # substitute (from Steven E. Brenner via Brooks Cutter). # Added escape() routine (w/mods) from Brooks Cutter. # 16 Jul 1994 (RTF): Added get_site() routine. # 27 Jul 1994 (RTF): Firmed-up algorithm for parsing relative URLs, fixing # several potential (but unlikely) bugs in the process. # Removed any hint of "URL:" prefix. # 17 Sep 1994 (RTF): Renamed parsing sets to match IETF draft and changed # how they are tested to use bitmap masks; # Modified parsing algorithm to use new sets; # Parser now handles URLs like http://host:/ and uses # the leftmost "?" as the start of query info; # Added caching of base URL components so that they don't # get re-parsed for every URL in a document. # Allowed lowercase hex digits in unescape. # # If you have any suggestions, bug reports, fixes, or enhancements, # send them to the libwww-perl mailing list at . # --------------------------------------------------------------------------- package wwwurl; %DefPort = ( # Define the default ports for major net services 'ftp', 21, 'file', 0, # note: non-local file URLs are changed to ftp URLs 'telnet', 23, 'whois', 43, 'gopher', 70, 'finger', 79, 'http', 80, 'nntp', 119, 'news', 119, 'wais', 210, 'webster', 765, 'prospero', 1525, # I thought it was 191, but IETF differs ); # =========================================================================== # The following six categories are bitmap masks for determining membership # in the corresponding URL syntactic set, as per the IETF/URI working group # draft specification for Relative URLs . $UsesRelative = 1; $UsesNetloc = 2; $NonHierarchical = 4; $UsesParams = 8; $UsesQuery = 16; $UsesFragment = 32; %InSet = ( # Define scheme membership in each category '', ($UsesRelative | $UsesNetloc | $UsesFragment | $UsesQuery), 'http', ($UsesRelative | $UsesNetloc | $UsesFragment | $UsesQuery), 'file', ($UsesRelative | $UsesNetloc | $UsesFragment), 'ftp', ($UsesRelative | $UsesNetloc | $UsesFragment | $UsesParams), 'prospero', ($UsesRelative | $UsesNetloc | $UsesFragment | $UsesParams), 'nntp', ($UsesRelative | $UsesNetloc | $UsesFragment), 'gopher', ($UsesRelative | $UsesNetloc | $NonHierarchical | $UsesFragment), 'wais', ($UsesRelative | $UsesNetloc | $NonHierarchical | $UsesQuery | $UsesFragment), 'mailto', ($NonHierarchical), 'news', ($NonHierarchical | $UsesFragment), 'finger', ($UsesNetloc | $NonHierarchical | $UsesFragment), 'whois', ($UsesNetloc | $NonHierarchical | $UsesFragment), 'webster', ($UsesNetloc | $NonHierarchical | $UsesFragment), 'telnet', ($UsesNetloc | $NonHierarchical), 'rlogin', ($UsesNetloc | $NonHierarchical), 'tn3270', ($UsesNetloc | $NonHierarchical), ); # =========================================================================== # The following package globals are used to cache the last Base URL parsed. $Burl = ''; $Bsch = ''; $Baddr = ''; $Bport = ''; $Bpath = ''; $Bquery = ''; $Bfrag = ''; $Bmem = 0; # =========================================================================== # =========================================================================== # parse(): Parse the given URL into its component parts according to # WWW URI rules, returning '' for those that are not present. # If no scheme is given, the URL is parsed according to HTTP rules, # so schemes which use different rules may have to recombine parts. # # Returns the folowing in order: # # $scheme : The access scheme (converted to lower case); # $address: The login or hostname/IP address (if appropriate); # $port : The TCP port (if appropriate); # $path : The object path (plus any params); # $query : The post-'?' search info (if scheme uses queries); # $frag : The post-'#' fragment identifier (if uses fragments). # sub parse { local($url) = @_; local($scheme) = ''; local($address) = ''; local($port) = ''; local($path) = ''; local($query) = ''; local($frag) = ''; if ($url =~ s#^([.+\-\w]+):##) { $scheme = $1; $scheme =~ tr/A-Z/a-z/; } local($member) = $InSet{$scheme} || 0; if ($member & $UsesFragment) { if ($url =~ s/#([^#]*)$//) { $frag = $1; } } if (($member & $UsesNetloc) && ($url =~ m#^//#o)) { $url =~ s#^//([^/]*)##; $address = $1; if ($address =~ s/:(\d*)$//) { $port = $1; } } if ($member & $UsesQuery) { if ($url =~ s/\?(.*)$//) { $query = $1; } } $path = $url; return ($scheme, $address, $port, $path, $query, $frag); } # =========================================================================== # compose(): Recombine the given component parts into a URL string. # # The following in components may be passed in: # # $scheme : The access scheme; # $address: The hostname/IP address; # $port : The TCP port; # $path : The object path (plus any params); # $query : The post-? search info # $frag : The post-'#' fragment identifier # sub compose { local($scheme, $address, $port, $path, $query, $frag) = @_; local($url) = ''; if ($scheme) { $url = $scheme . ':'; } if ($address) { $url .= "//$address"; if ($port) { $url .= ":$port"; } } if ($path) { $url .= $path; } if ($query) { if (!$path) { $url .= '/'; } # Avoid mistaking query as being $url .= "?$query"; # part of the address } if ($frag) { $url .= "#$frag"; } return $url; } # =========================================================================== # unescape(): Return the passed URL after replacing all %NN escaped chars # with their actual character equivalents. # sub unescape { local($url) = @_; $url =~ s/%([\dA-Fa-f][\dA-Fa-f])/pack("C",hex($1))/ge; return $url; } # =========================================================================== # escape(): Return the passed string after replacing all characters matching # the passed pattern with their %XX hex escape chars. Note that # the caller must be sure not to escape reserved URL characters # (e.g. / in pathnames, ':' between address and port, etc.) and thus # this routine can only be applied to each URL part separately. E.g. # # $escname = &escape($name,'[\x00-\x20"#%/;<>?\x7F-\xFF]'); # sub escape { local($str, $pat) = @_; $str =~ s/($pat)/sprintf("%%%02lx",unpack('C',$1))/ge; return($str); } # =========================================================================== # absolute(): Return the absolute URL given a (possibly relative) URL # and the document's absolute base URL. Uses the $B* variables # to cache the last Base URL parsed. # sub absolute { local($base, $url) = @_; $url =~ s/^\s+//; # Remove any preceding whitespace $url =~ s/\s.*//; # Remove anything after first word local($scheme, $addr, $port, $path, $query, $frag) = &parse($url); local($member) = $InSet{$scheme} || 0; RELATED: { if (!$base) # If no base was given then it can't be relative { if (!$scheme) { $scheme = 'file' } # Default to a file URL last RELATED; } if ($base ne $Burl) # Check the Base URL cache { $Burl = $base; ($Bsch,$Baddr,$Bport,$Bpath,$Bquery,$Bfrag) = &parse($Burl); $Bmem = $InSet{$Bsch} || 0; } if (!$scheme) { $scheme = $Bsch; $member = $Bmem; if ($query && !($Bmem & $UsesQuery)) # Restore mistaken queries { $path .= '?'. $query; $query = ''; } } else { last RELATED if ($scheme ne $Bsch); } last RELATED unless ($member & $UsesRelative); last RELATED if ($addr || $port); # Child must have used '//' $addr = $Baddr; # else it inherits base netloc $port = $Bport; if (!$path) { $path = $Bpath; if (!$query) { $query = $Bquery; } } elsif ($member & $NonHierarchical) {;} # Do nothing elsif ($path !~ m|^/|o) # If the child URL does not begin with '/' { local($ppath); if ($Bpath) { $ppath = $Bpath; if ($Bmem & $UsesParams) { $ppath =~ s#;.*##; # Trim off any base parameters } $ppath =~ s#/[^/]*$#/#; # Trim off any base filename } else { $ppath = '/'; } $path = $ppath . $path; # # The order in which we remove the relative "/." and "xxx/.." # path segment components is extremely important. # while ($path =~ s#/\./#/#) {;} $path =~ s#/\.$#/#; while ($path =~ s#/[^/]*/\.\./#/#) {;} $path =~ s#/[^/]*/\.\.$#/#; } } if ($scheme eq 'file') { if (!$addr) { $addr = 'localhost'; } elsif ($addr ne 'localhost') { $scheme = 'ftp'; } # # The above line will have to be deleted once people stop using # file: as an alias for ftp: (i.e. when the IETF standard is done). # } elsif ($scheme eq 'http') { $path =~ s#^/\%7E#/~#io; } # NOTE: Fanatical spec-followers should reverse the above substitution # because it improperly prefers the tilde character over %7E (:-b) if ($port && ($port == $DefPort{$scheme})) { $port = ''; } if (!$path) { $path = '/'; } return &compose($scheme, $addr, $port, $path, $query, $frag); } # =========================================================================== # get_site(): Return the site part of the passed-in absolute URL # (i.e. the hostname:port) replacing any missing port with # the default. If the URL scheme does not allow hostport, # then we return ''; # sub get_site { local($scheme, $site, $port, $path, $query, $frag) = &parse($_[0]); return '' unless (defined($DefPort{$scheme})); if (!$port) { $port = $DefPort{$scheme}; } if ($port) { $site .= ":$port"; } return $site; } # =========================================================================== 1;