Exercise: HTML content
- Find a package that can strip the HTML tags from an HTML file and install it.
- If you found the HTML::Strip Perl module, then the script below will do the work using that module.
- Save the home page of IMDB, strip the html and then count how many numbers are in the file.
examples/linux/htmlstrip.pl
#!/usr/bin/env perl
use strict;
use warnings;

use Getopt::Long qw(GetOptions);
use HTML::Strip;

# Strip the HTML markup from every file named on the command line, or from
# STDIN when no file name is given, and print the remaining text to STDOUT.

# --help prints the usage text and exits successfully; an option that
# GetOptions rejects prints the same text but exits with a failure status
# so that calling shells and scripts can detect the mistake.
GetOptions('help' => sub { usage(0) }) or usage(1);

if (@ARGV) {
    foreach my $file (@ARGV) {
        my $fh;
        if (not open $fh, '<', $file) {
            # Include $! so the user can see why the open failed; the
            # missing trailing newline keeps Perl's "at ... line N" suffix.
            warn "Could not open '$file': $!";
            next;
        }

        # Slurp the whole file: $/ is undefined only inside the do block.
        my $content = do { local $/ = undef; <$fh> };
        close $fh;

        # Test definedness and length instead of truth, otherwise a file
        # whose entire content is "0" would be skipped as if it were empty.
        if (defined $content and length $content) {
            strip($content);
        }
    }
}
else {
    my $content = join '', <STDIN>;
    strip($content);
}

# strip($raw_html)
#   Parse $raw_html with a fresh HTML::Strip object and print the resulting
#   text to STDOUT. Returns nothing.
sub strip {
    my ($raw_html) = @_;

    my $hs         = HTML::Strip->new();
    my $clean_text = $hs->parse($raw_html);
    $hs->eof;    # tell the parser that this document is complete

    print $clean_text;
    return;
}

# usage([$exit_code])
#   Print the usage text to STDOUT and terminate the program.
#   $exit_code is the process exit status; it defaults to 0.
sub usage {
    my ($exit_code) = @_;
    $exit_code = 0 unless defined $exit_code;

    print <<"USAGE";
Usage: $0 filename
       cat file | $0
USAGE
    exit $exit_code;
}