#!/usr/local/bin/perl
#
# Converts HTML files into ASCII by just stripping anything between
# < and >.
# Written 4/21/96 by Michael Smith for WebGlimpse.
#
# Added code to replace HTML codes for special chars with the
# characters themselves.  12/19/98 --GB
#
# Also add space in place of space-producing HTML tags.
# 12/22/98 --GB
#
# Reads HTML line by line and prints the stripped text to STDOUT.
# NOTE(review): the read handle was missing from the copy this file was
# restored from; STDIN is assumed -- confirm against how the filter is invoked.

use strict;
use warnings;

# Names of the HTML character entities for code points 160..255, listed in
# code point order (16 per row), so that a name's position gives its number.
my @LATIN1_ENTITY_NAMES = qw(
    nbsp   iexcl  cent   pound  curren yen    brvbar sect
    uml    copy   ordf   laquo  not    shy    reg    macr
    deg    plusmn sup2   sup3   acute  micro  para   middot
    cedil  sup1   ordm   raquo  frac14 frac12 frac34 iquest
    Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
    Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
    ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml   times
    Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig
    agrave aacute acirc  atilde auml   aring  aelig  ccedil
    egrave eacute ecirc  euml   igrave iacute icirc  iuml
    eth    ntilde ograve oacute ocirc  otilde ouml   divide
    oslash ugrave uacute ucirc  uuml   yacute thorn  yuml
);

# Lookup table: entity body (the text between '&' and ';', e.g. "eacute" or
# "#233") => replacement text.  Replacements are built with chr() so this
# source file stays pure ASCII; each one is a single character below 256.
my %ENTITY_TEXT;
{
    my $code = 160;
    for my $name (@LATIN1_ENTITY_NAMES) {
        $ENTITY_TEXT{$name} = $ENTITY_TEXT{"#$code"} = chr $code;
        $code++;
    }
}

# A non-breaking space becomes an ordinary space in the text output.
$ENTITY_TEXT{nbsp} = $ENTITY_TEXT{'#160'} = ' ';

# The soft hyphen has always been replaced by a backslash in this script.
# NOTE(review): that looks unintended -- confirm before changing it.
$ENTITY_TEXT{shy} = $ENTITY_TEXT{'#173'} = '\\';

# Misspelling of "eth" accepted by the earlier version; kept so input that
# relied on it converts the same way.
$ENTITY_TEXT{ieth} = chr 240;

$ENTITY_TEXT{quot} = $ENTITY_TEXT{'#34'} = '"';
$ENTITY_TEXT{amp}  = $ENTITY_TEXT{'#38'} = '&';

# True while a tag opened on an earlier line has not been closed yet.
my $inside_tag = 0;

while (my $line = <STDIN>) {
    my $text = strip_tags($line, \$inside_tag);
    print $text if defined $text;
}

# strip_tags($line, \$inside_tag)
#
# Removes the HTML tags from one input line and translates its entities.
#   $line        - the raw line, including its line terminator if any
#   \$inside_tag - reference to the flag carried between lines; read to learn
#                  whether the line starts inside a tag, and updated to say
#                  whether it ends inside one
# Returns the converted line, or undef when the whole line lies inside a tag
# and nothing should be printed for it.
sub strip_tags {
    my ($line, $inside_tag_ref) = @_;

    if ($$inside_tag_ref) {
        # Drop everything up to and including the first '>'.  If there is no
        # '>' the tag is still open and the line contributes no text.
        return unless $line =~ s/[^>]*>//;
        $$inside_tag_ref = 0;
    }

    # Replace each complete tag by '' or ' '.  Repeat until nothing matches,
    # because removing one tag can leave text that forms another.
    1 while $line =~ s/(<[^\s>][^>]*>)/addspace($1)/ge;

    # A tag that starts here but is not closed on this line: cut it off and
    # remember that the next line begins inside it.  '.' does not match the
    # newline, so the line terminator is kept.
    if ($line =~ s/<[^\s>].*$//) {
        $$inside_tag_ref = 1;
    }

    return fixspecial($line);
}

# addspace($tag)
#
# Returns the text that replaces a stripped tag: the empty string for tags
# that do not separate words (b, i, em, font, strong, big, sup, sub, u,
# strike, style -- opening or closing, any letter case) and a single space
# for every other tag.
sub addspace {
    my ($tag) = @_;
    $tag = '' unless defined $tag;

    # Check for tags that should NOT return a space.  "b", "i", "em" and "u"
    # must be followed directly by '>'; the other names match as prefixes.
    return '' if $tag =~ m{</?(?:b>|i>|em>|font|strong|big)}i;
    return '' if $tag =~ m{</?(?:sup|sub|u>|strike|style)}i;

    # Otherwise, put in a space
    return ' ';
}

# fixspecial($text)
#
# Returns $text with the character entities listed in %ENTITY_TEXT replaced
# by the characters they stand for.  Both the named form (&eacute;) and the
# decimal form (&#233;) are recognised; names are case-sensitive.  Anything
# not in the table is left exactly as it was.
#
# The text is scanned once and replacements are not scanned again, so an
# ampersand produced from &amp; can never combine with the text after it
# into a second entity ("&amp;nbsp;" yields "&nbsp;").
sub fixspecial {
    my ($text) = @_;
    return $text unless defined $text;

    $text =~ s/(&(#?[A-Za-z0-9]+);)/exists $ENTITY_TEXT{$2} ? $ENTITY_TEXT{$2} : $1/ge;

    return $text;
}