#! /usr/local/bin/perl -w
# $Id: urlgrep.pl,v 1.5 1998/02/10 19:56:56 user Exp $
#
# list all embedded URLs in plaintext, being
# careful of trailing punctuation, like in this line:
# Visit http://www.xor.com/. Maybe http://internet-plaza.net/?

require 5.002;  # not imperative

# cannot use IO::Handle
use FileHandle;
# Paragraph mode: every read returns one blank-line-delimited paragraph.
# The record separator is a global setting rather than a per-handle one,
# so assign $/ directly; IO::Handle carps when input_record_separator()
# is invoked through a handle such as ARGV.
$/ = '';

# Building blocks for the URL pattern.  They stay plain strings (not
# qr// objects) because they are spliced into character classes and
# into a case-insensitive pattern in the matching loop.

# Alternation of the URL schemes to recognise.  The group is
# non-capturing so that the URL itself is the only capture in the
# final pattern.
my $urls = '(?:'
         . join('|', qw{
               http
               ftp
               file
               telnet
               gopher
               mailto
               about
               wais
           })
         . ')';

my $ltrs = '\w';                # word characters
my $gunk = '/#~:.?+=&%@!\-';    # other characters accepted inside a URL
my $punc = '.:?\-';             # URL characters that may also be trailing
                                #   sentence punctuation
my $any  = "$ltrs$gunk$punc";   # everything accepted inside a URL
                                #   (character-class body)

# Read the input paragraph by paragraph and print every URL found, one
# per line.  The explicit defined() keeps a final record of "0" from
# ending the loop before the remaining input files have been read.
while (defined(my $para = <>)) {
    # /i  case-insensitive
    # /g  find every match; each pass resumes where the last one ended
    # /x  allow whitespace and comments inside the pattern
    while ($para =~ m{
        \b                      # start at a word boundary
        (                       # capture 1 is the URL itself
            $urls :             # a known scheme and a literal colon
            [$any] +?           # then one or more URL characters, but
                                #   as few as possible (note the +?)
        )                       # end of capture 1
        (?=                     # look-ahead, consumes nothing
            [$punc]*            # optional trailing punctuation
            (?:                 # followed by either
                [^$any]         #   a character that cannot be in a URL
              |                 # or
                \z              #   the very end of the text, so that
                                #   punctuation is also dropped when the
                                #   URL is the last thing in the input
            )
        )
    }igx) {
        print "$1\n";
    }
}

exit 0;
