#!/usr/bin/perl

#######################################################################
#
# atomize_nyt.pl
# Marc A. Garrett/since1968.com
# 21 Apr 2006
# Gets most searched subjects from list on NYTimes home page and
# creates a well-formed atom feed.
# Code is based on Simon Cozens' Hack #24 in "Spidering Hacks".
# Written as part of Kunal Anand's XML Exam, Question 4.
#
#######################################################################

use strict;
use warnings;

use Template::Extract;
use LWP::Simple qw(get);
use Data::Dumper;             # only needed by the commented-out debug line below
use XML::Atom::SimpleFeed;
use DateTime;                 # not referenced below; kept so the dependency list is unchanged

# Fetch the home page. LWP::Simple's get() is used without any further
# error detail, so a false result is all we can report on.
my $page = get("http://www.nytimes.com");
$page or die "Couldn't retrieve page.";

# Normalise the HTML so the line-oriented template below has a chance of
# matching: drop DOS carriage returns, drop blank lines, and strip the
# leading whitespace of EVERY line.
$page =~ s/\r//g;
$page = join "\n", grep { /\S/ } split /\n/, $page;

# /m is required here: without it ^ only matches at the very start of the
# string, so only the first line would be trimmed. Whitespace-only lines
# were already removed above, so \s+ cannot swallow a line break.
$page =~ s/^\s+//mg;

# uncomment this line if you want to see the HTML before it's
# extracted via the template
# print $page;

my $extractor = Template::Extract->new;

# NOTE(review): the literal HTML between the directives was not recoverable
# from the copy of this script that was reviewed -- only the [% ... %]
# directives and their order survived. The tags below (<div>, <ol>, <li>,
# <a href>) are PLACEHOLDERS, including the [% url %] capture that the feed
# loop relies on. Confirm them against the live page markup before use.
my $template = <<'TEMPLATE';

[% ... %]
<div>
<ol>
[% FOR records %]
<li><a href="[% url %]">[% query %]</a></li>
[% ... %]
[% END %]
</ol>
[% ... %]
</div>
[% ... %]

TEMPLATE

my $data = $extractor->extract($template, $page);

# dump extract results
# uncomment the following line if you're not sure what the scraper is getting
# print Dumper( $data );

# Guard against a failed or empty extraction instead of dereferencing a
# missing structure. The feed is still emitted (with no entries), but the
# operator is told why it is empty.
my $records = [];
if ( ref $data eq 'HASH' && ref $data->{records} eq 'ARRAY' ) {
    $records = $data->{records};
}
warn "No records extracted; the template may no longer match the page.\n"
    unless @{$records};

# create feed object
my $atom = XML::Atom::SimpleFeed->new(
    title   => "NY Times Popular Searches",
    link    => "http://since1968.com/",
    tagline => "This feed generated by scraping the NY Times home page."
);

# One feed entry per extracted search term.
for my $record ( @{$records} ) {
    $atom->add_entry(
        title  => $record->{query},
        link   => $record->{url},
        author => { name => "NY Times" }
    );
}

# print the feed
$atom->print;