############################################################################# # Sample code for "Creating a Web Crawler in 3 Steps" # # Written by Issac Goldstand - All rights reserved # # Mirimar Networks - http://www.mirimar.net/ # # Presented at YAPC::Israel::2005 # # # # This program is free software. It can be modified and/or redistributed # # under the same terms as Perl itelf, as long as this notice is kept # # # # The latest version of this file, and the original presentation, can be # # found at my personal website at http://www.beamartyr.net/ # ############################################################################# #!/usr/bin/perl use strict; use LWP::RobotUA; use HTML::Parser; use HTML::SimpleLinkExtor; my @urls; # List of URLs to visit my %authors; # First, create the user agent - MyBot/1.0 my $ua=LWP::RobotUA->new('AuthorBot/1.0','isaac@cpan.org'); $ua->delay(15/60); # 15 seconds delay $ua->use_sleep(1); # Sleep if delayed # Create parsers my $p=My::LinkParser->new; my $linkex=HTML::SimpleLinkExtor->new; # Initialize list of URLs $urls[0]="http://www.beamartyr.net/"; # Parse loop for (my $i=0;$i<10;$i++) { my $response=$ua->get(pop @urls); # Get HTTP response if ($response->is_success) { # If reponse is OK $p->reset; $p->parse($response->content); # Parse for author $p->eof; if ($p->state==1) { # If state is FOUND_AUTHOR $authors{$p->author}++; # then add author count } else { $authors{'Not Specified'}++; # otherwise add default count } $linkex->parse($response->content); # parse for links unshift @urls,$linkex->a; # and add links to queue } } # Print results print "Results:\n"; map {print "$_\t$authors{$_}\n"} keys %authors; ### Parser class package My::LinkParser; use base qw(HTML::Parser); # Define simple constants use constant START=>0; use constant GOT_NAME=>1; # Simple access methods sub state { return $_[0]->{STATE}; } sub author { return $_[0]->{AUTHOR}; } # Clear parser state sub reset { my $self=shift; undef $self->{AUTHOR}; $self->{STATE}=START; return 0; } # Parser hook sub start { my($self, $tagname, $attr, $attrseq, $origtext) = @_; if ($tagname eq "meta" && lc($attr->{name}) eq "author") { $self->{STATE}=GOT_NAME; $self->{AUTHOR}=$attr->{content}; } }