Exercise: HTML content

Find a package that can strip the HTML tags from an HTML file and install it.
If you found the HTML::Strip Perl module, then the script below will do the work using that module.
Save the home page of IMDB, strip the HTML and then count how many numbers are in the file.

examples/linux/htmlstrip.pl

#!/usr/bin/env perl
use strict;
use warnings;

use Getopt::Long qw(GetOptions);
use HTML::Strip;

# Handle command-line options: --help runs usage(), and so does a failure
# to parse the options.
GetOptions('help' => \&usage) or usage();

if (@ARGV) {
    # Every remaining argument is treated as the name of an HTML file.
    foreach my $file (@ARGV) {
        my $fh;
        unless (open $fh, '<', $file) {
            # Include the OS error so the user can see why the open failed,
            # then carry on with the remaining files.
            warn "Could not open '$file': $!";
            next;
        }

        # Slurp the whole file; $/ is only undefined inside this do block.
        my $content = do { local $/ = undef; <$fh> };
        close $fh;

        # Check definedness and length instead of truth so that a file whose
        # entire content is the string "0" is still processed.
        strip($content) if defined $content and length $content;
    }
} else {
    # No file names given: read the whole document from standard input.
    my $content = join '', <STDIN>;
    strip($content);
}

# Strip the markup from the given HTML string using HTML::Strip and print
# the resulting text to the currently selected output handle.
#
# Parameters:
#   $raw_html - string holding the HTML to be cleaned
sub strip {
    my $html     = shift;
    my $stripper = HTML::Strip->new;
    my $text     = $stripper->parse($html);

    # Finish this document (see the HTML::Strip documentation for eof).
    $stripper->eof;

    print $text;
}

# Print a short description of how to invoke the script, then exit.
# Used both as the --help callback and when option parsing fails.
sub usage {
    my @lines = (
        'Usage:',
        "    $0 filename",
        "    cat file | $0",
    );

    # Emit the message as one string, each line newline-terminated.
    print join '', map { "$_\n" } @lines;
    exit;
}