#!/usr/bin/perl -w #****************************************************************************** #* #* GELLYFISH SOFTWARE #* #* #****************************************************************************** #* #* PROGRAM : htbeauty.pl #* #* AUTHOR : JNS #* #* DESCRIPTION : HTML Beautify #* #***************************************************************************** #* #* $Log: htbeauty.pl,v $ #* Revision 1.4 2001/11/03 17:32:31 gellyfish #* * Fixed to play nice with XHTML #* * Wrap long attribute lists #* #* Revision 1.3 2001/11/03 15:19:10 gellyfish #* * Made strict safe #* * Removed globals #* #* Revision 1.2 1998/03/21 22:03:22 jns #* First safe version #* #* Revision 1.1 1998/03/21 17:42:49 jns #* Initial revision #* #* #*****************************************************************************/ package HTBeauty; use strict ; use Text::Wrap qw(wrap); use File::Basename; use vars qw(@ISA); use Getopt::Std; @ISA =qw(HTML::Parser); require HTML::Parser; my $VERSION = sprintf("%d.%02d",q$Revision: 1.4 $ =~ m/: (\d+)\.(\d+)/) ; my ($progname,$progpath,$progext) = fileparse($0,".pl"); $Text::Wrap::columns = 80; # tags which may not have a corresponding / tag ( P may but neednt ) my %unarytag = ( hr => 1, p => 1, meta => 1, br => 1, nobr => 1, img => 1 ); my %listtag = ( ol => 1, ul => 1, dl => 1 ); my %listitem = ( li => 1, dt => 1 ); my %opt; sub usage($); getopts('x',\%opt) || die usage($progname); die usage($progname) unless ($ARGV[0]); my $p = new HTBeauty; $p->{_pad} = 0; $p->{_seen} = {}; $p->{_indentlength} = 3; if ( defined $opt{'x'} ) { $p->{_xhtml}++ ; delete $unarytag{'p'}; } $p->parse_file($ARGV[0]) || die "parse_file failed: $! \n"; sub declaration($$) { my ($self,$decl) = @_; if ( $decl =~ /xhtml/i ) { $self->{_xhtml}++ ; delete $unarytag{'p'}; } print "\n"; } sub start($$$$$) { my ($self,$tag,$attr,$attrseq,$orig) = @_; my $closetag = '>'; if ( defined $self->{_xhtml} ) { delete $attr->{'/'}; @{$attrseq} = grep !/\//, @{$attrseq}; if ( $unarytag{$tag} ) { $closetag = ' />'; } } else { if ( exists $attr->{'/'} ) { die <ucase($tag); if ($tag eq $self->ucase("meta")) { return if ( $attr->{content} =~ /$progname/ ); } $self->{_seen}->{$tag}++; my $fulltag = $tag; my $no_attrs = @{$attrseq}; my $indent = " " x $self->{_pad}; my $attr_pad = $indent . " " . " " x length($tag); foreach my $attribute ( @{$attrseq} ) { my $fullattr = $self->ucase($attribute) . q%="% . $attr->{$attribute} . q%"%; $fulltag .= " " . $fullattr; if($no_attrs >2) { $fulltag .= "\n" . $attr_pad; } } $self->{_pad} -= $self->{_indentlength} if ( $listitem{$tag}); if ($self->{_seen}->{$self->ucase('pre')}) { print "<",$fulltag,"$closetag"; } else { print $indent . "<",$fulltag,"$closetag\n"; } if ($tag =~ m{HEAD}i ) { print $indent,$indent; my $meta = $self->ucase(qq{\n}; printf $meta,$progname,$VERSION; print $indent,$indent,"\n"; } $self->{_pad} += $self->{_indentlength} unless ($unarytag{lc($tag)}); $self->{_pad} += $self->{_indentlength} if ($listtag{lc($tag)}); } sub comment($$) { my ($self,$comment) = @_; if ($self->{_seen}->{$self->ucase("script")}) { print $comment; } } sub text($$) { my ($self,$text) = @_; if ($self->{_seen}->{$self->ucase('pre')} or $self->{_seen}->{$self->ucase('style')}) { print $text; } else { chomp($text); $text =~ s/^\s*//g; $text =~ s/\s*$//g; $text =~ s/\s+/ /g; my $indent = " " x $self->{_pad}; print wrap($indent,$indent, $text),"\n" if ($text); } } sub end($$) { my ($self,$tag) = @_; $tag = $self->ucase($tag); if ($self->{_seen}->{$tag}) { $self->{_pad} -= $self->{_indentlength} unless ($unarytag{lc($tag)}); $self->{_pad} -= $self->{_indentlength} if ($listtag{lc($tag)}); my $indent = " " x $self->{_pad}; $self->{_seen}->{$tag}--; if ($self->{_seen}->{$self->ucase("pre")}) { print ""; } else { print $indent . "\n"; } } } sub ucase { my ( $self, $text) = @_; unless ( defined $self->{_xhtml} or $self->{_lower} ) { $text = uc $text; } return $text; } sub usage($) { my ( $progname ) = @_; return < EOUSAGE } 1; __END__ =head1 NAME HTBeauty =head1 USAGE CfilenameE > =head1 DESCRIPTION HTBeauty is a rudimentary HTML beautifier. Its original purpose is to reformat HTML produced by certain programs that dont conform to its authors aesthetic views. The output HTML is indented and the tags and attributes made uppercase. Comments are removed at present except whilst within a ESCRIPTE block. =head1 CAVEATS As said above B will currently remove comments. Thus if used against a file created by MS FrontPage containing any 'bots' or extensions which manifest themselves in comments it may well trash your page. This is not considered a bug by the author. Preformatted text is not necessarily going to work as well as it should as of the current version. =head1 COPYRIGHT Copyright (c) 1998,2001 Jonathan Stowe. This is free software it may be distributed under the same terms as perl itself. No warranty express or implied is made for the functionality of this software. Use of this software is at the users own risk. =head1 AUTHOR Jonathan Stowe jns@gellyfish.com =cut