#!/usr/bin/perl -w

# This script takes the URLs of several RSS feeds and creates two things:
# 1) A single RSS 1.0 feed combining all (or some of) the items.
# 2) An HTML file of all (or some of) the items.

# v1.0 2004-12-31
# Phil Gyford, phil@gyford.com

# NOTE(review): in the copy of this file that was reviewed, several string
# literals look as though HTML/XML markup has been stripped out of them
# (they consist only of whitespace plus the interpolated text).  Those
# literals are reproduced below exactly as they appeared and are marked with
# NOTE(review) so the markup can be restored from a pristine copy.

use strict;
use warnings;
use LWP::Simple;
use XML::RSS;
use Date::Manip qw(ParseDate UnixDate);
use POSIX qw(ceil);

######################################################
# CONFIG

# The URLs of each feed we're going to fetch.
# For each key in the hash there should be a corresponding
# get_key() function, which should be called before we
# create the RSS and HTML. See the existing examples.
my %FEEDS = (
    'flickr'  => 'http://www.flickr.com/services/feeds/photos_public.gne?id=35034346050@N01&format=rss_200',
    'links'   => 'http://www.gyford.com/phil/links/syndication/index.rdf',
    'notes'   => 'http://www.gyford.com/phil/notes/syndication/index.rdf',
    'writing' => 'http://www.gyford.com/phil/writing/syndication/index.rdf'
);

# The filesystem path to the feed you want to create.
my $OUTPUT_FEED = '/path/to/gyford.com/docs/phil/syndication/index.rdf';

# How many items do you want in your feed (the n most recent).
my $ITEMS_FOR_FEED = 20;

# The filesystem path to the HTML file you want to create.
my $OUTPUT_HTML = '/path/to/gyford.com/docs/includefiles/front_page.html';

# How many items do you want in your file (the n most recent).
my $ITEMS_FOR_HTML = 20;

# How many Flickr photos do you want appearing in a single day's entry?
# If there are more photos for the day, the maximum number (the most recent
# ones) will be displayed along with a link to Flickr indicating how many
# more photos remain.
my $MAX_PHOTOS_PER_DAY = 4;

# Your Flickr screen name.
my $FLICKR_SCREEN_NAME = 'philgyford';

# If one of the feeds can't be fetched the script uses a local cache.
# When a feed is successfully fetched it's saved to the local cache.
# This is the filesystem path to that local cache directory.
my $CACHE_DIR = '/path/to/a/cache/dir';

# Use cached copies of the feeds? Handy for testing, but you'll need to have
# fetched them successfully first!
my $USE_LOCAL = 'false';

# End config
######################################################

# This hash of hashes will contain all the items from all sources.
# The keys are made up of the unixtime of the item, and the appropriate
# key from %FEEDS, eg 'flickr', 'links', etc.
# eg: '1102167975_flickr'
# (Why not just unixtime keys? Because there might be two items from
# different sources at the same time. You never know.)
# Each item will have the following keys:
#   title            - Required
#   link             - Required
#   description      - Required
#   html_description - Optional - used instead of 'description' in
#                      the HTML file.
#   time             - Required
#   content:encoded  - Optional - used in the RSS if present.
my %ITEMS;

# Get all the feed items and put them into %ITEMS...
get_flickr();
get_links();
get_notes();
get_writing();

# Create the files...
create_rss();
create_html();

exit;


sub get_flickr {
    # Gets the items from the Flickr feed, groups them by day
    # and creates one entry in %ITEMS per day.
    # Takes no arguments; populates the file-level %ITEMS.

    my $rss = XML::RSS->new;
    $rss->parse(get_xml('flickr'));

    # We keep track of the previous photo's day so we know when
    # we reach a new day.
    my $prev_photodate;         # Date::Manip date of the previous photo.
    my $prev_photoday = '';     # Will be yyyy-mm-dd.

    # Photos collected for the day currently being read.  Each element is
    # a hash with 'title', 'link' and 'description'.
    my @todays_photos;

    PHOTO:
    foreach my $item (@{ $rss->{'items'} }) {
        my $photodate
            = defined $item->{'pubDate'} ? ParseDate($item->{'pubDate'}) : '';
        if (!$photodate) {
            # Without a date we cannot tell which day the photo belongs to.
            warn "Skipping a Flickr item with a missing or unparseable pubDate.\n";
            next PHOTO;
        }
        my $photoday = UnixDate($photodate, "%Y-%m-%d");

        if ($prev_photoday ne '' && $photoday ne $prev_photoday) {
            # A new day has started, so store the day we just finished.
            _add_flickr_day(\@todays_photos, $prev_photodate);
            @todays_photos = ();
        }

        push @todays_photos, {
            'title'       => $item->{'title'},
            'link'        => $item->{'link'},
            'description' => $item->{'description'},
        };

        $prev_photodate = $photodate;
        $prev_photoday  = $photoday;
    }

    # The loop above only stores a day when it sees the *next* day, so the
    # final day read from the feed is still pending here.  Previously it was
    # dropped, which meant a feed whose photos were all taken on one day
    # produced no Flickr entry at all.
    if (@todays_photos) {
        _add_flickr_day(\@todays_photos, $prev_photodate);
    }

    return;
}


sub _add_flickr_day {
    # Builds the combined entry for one day's photos and stores it in %ITEMS.
    #   $photos   - array ref of photo hashes (title, link, description).
    #   $day_date - Date::Manip date of the last photo read for that day;
    #               used for the key, title, link and time of the entry.
    my ($photos, $day_date) = @_;

    my $day_description  = '';  # Used in the RSS.
    my $html_description = '';  # Used in the HTML instead of the above.
    my $num_photos_added = 0;

    PHOTO:
    foreach my $photo (@{$photos}) {

        if ($num_photos_added == $MAX_PHOTOS_PER_DAY) {
            # We don't want to add any more photos for this day; say how
            # many we are not displaying.
            my $remaining = scalar(@{$photos}) - $MAX_PHOTOS_PER_DAY;
            my $plural    = $remaining == 1 ? 'photo' : 'photos';

            # NOTE(review): markup (presumably a link to Flickr) appears to
            # be missing from these two strings - restore from the original.
            $day_description  .= "\n\nSee " . $remaining . " more $plural from this day\n\n\n";
            $html_description .= "\n\nSee " . $remaining . " more $plural from this day\n\n\n";
            last PHOTO;
        }

        my $title = defined $photo->{'title'} ? $photo->{'title'} : '';
        my $description
            = defined $photo->{'description'} ? $photo->{'description'} : '';

        # Extract the bits of the description we want.
        my ($img_path, $width, $height)
            = ($description =~ m{src="(.*?)"\swidth="(\d+)"\sheight="(\d+)"});

        # The actual 'description' bit: not the title or the image tag but
        # the paragraphs of description you can enter.
        # NOTE(review): this pattern matches on blank lines only; it looks
        # like the tags it originally matched were stripped - confirm.
        my ($img_description)
            = ($description =~ m{\n\n.*?\n\n.*?\n\n(.*?)\n\n}s);

        if (defined $img_path) {
            # Only touch the image details when the match succeeded, so a
            # description without an image no longer triggers warnings.
            ($width, $height) = _thumbnail_size($width, $height);

            # Use thumbnail image, rather than medium size.
            $img_path =~ s/_m\.jpg/_t.jpg/;
        }
        # NOTE(review): $img_path, $width and $height are not interpolated
        # into any string below; presumably an image tag that used them was
        # stripped from this copy - restore from the original.

        $day_description  .= "\n\n" . $title . "\n" . "\n" . $title;
        $html_description .= "\n\n" . $title . "\n\n" . $title . "\n\n\n";

        if ($img_description) {
            # We're not currently using the lengthy descriptions on the
            # HTML page.
            $day_description .= "\n\n$img_description";
        }
        $day_description .= "\n\n\n";

        $num_photos_added++;
    }

    # The key is like '1102167975_flickr'.
    $ITEMS{ UnixDate($day_date, "%s") . '_flickr' } = {
        'title' => 'Photos for ' . UnixDate($day_date, "%e %B %Y"),
        'link'  => 'http://www.flickr.com/photos/' . $FLICKR_SCREEN_NAME
                 . '/archives/date-posted/'
                 . UnixDate($day_date, "%Y/%m/%d/detail/"),
        'description'      => $day_description,
        'html_description' => $html_description,
        # The timezone is taken from the same date as the rest of the entry
        # (it used to come from the first photo of the following day).
        'time' => UnixDate($day_date, "%Y-%m-%dT") . '23:59:00'
                . timezone($day_date),
    };

    return;
}


sub _thumbnail_size {
    # Returns (width, height) of the thumbnail for a medium-size image.
    # Flickr feed image (medium size) is max 240 width or height.
    # Thumbnail is max 100.
    # Flickr rounds UP when creating smaller size thumbs (hence ceil).
    my ($width, $height) = @_;

    # Multiply before dividing so that exact multiples are not nudged up a
    # pixel by floating-point error in 100/240.
    if ($width > $height) {
        return (100, ceil($height * 100 / 240));
    }
    if ($height > $width) {
        return (ceil($width * 100 / 240), 100);
    }
    return (100, 100);
}


sub _add_feed_items {
    # Shared worker for the simple feeds: copies every item of the feed
    # named $type into %ITEMS under a key like '1102167975_links'.
    # Options:
    #   title_prefix - text put in front of each item title (default none).
    #   with_content - true to register the RSS 'content' module and copy
    #                  each item's content:encoded value.
    my ($type, %opt) = @_;

    my $prefix = defined $opt{'title_prefix'} ? $opt{'title_prefix'} : '';

    my $rss = XML::RSS->new;
    if ($opt{'with_content'}) {
        $rss->add_module(
            'prefix' => 'content',
            'uri'    => 'http://purl.org/rss/1.0/modules/content/'
        );
    }
    $rss->parse(get_xml($type));

    ITEM:
    foreach my $item (@{ $rss->{'items'} }) {
        my $date     = $item->{'dc'}->{'date'};
        my $unixtime = defined $date ? UnixDate($date, "%s") : undef;

        if (!defined $unixtime || $unixtime eq '') {
            # An undated item would get the key "_$type", which sorts ahead
            # of every real timestamp, so leave it out instead.
            warn "Skipping an item in the '$type' feed with no usable dc:date.\n";
            next ITEM;
        }

        my $title = $item->{'title'};
        if ($prefix ne '') {
            $title = $prefix . (defined $title ? $title : '');
        }

        my %entry = (
            'title'       => $title,
            'link'        => $item->{'link'},
            'description' => $item->{'description'},
            'time'        => $date,
        );
        if ($opt{'with_content'}) {
            $entry{'content:encoded'} = $item->{'content'}->{'encoded'};
        }

        $ITEMS{ $unixtime . '_' . $type } = \%entry;
    }

    return;
}


sub get_links {
    # Nice and simple, get all the items from the links RSS and add them
    # to %ITEMS.  The keys are like '1102167975_links'.
    _add_feed_items('links');
    return;
}


sub get_notes {
    # Get all the items from the notes RSS and add them to %ITEMS, with
    # 'Notes: ' in front of each title and the content:encoded value kept.
    # The keys are like '1102167975_notes'.
    _add_feed_items('notes', 'title_prefix' => 'Notes: ', 'with_content' => 1);
    return;
}


sub get_writing {
    # Get all the items from the writing RSS and add them to %ITEMS,
    # keeping the content:encoded value.
    # The keys are like '1102167975_writing'.
    _add_feed_items('writing', 'with_content' => 1);
    return;
}


sub _newest_first_keys {
    # Returns the keys of %ITEMS ordered newest first.
    # The leading unixtime is compared numerically, so timestamps with
    # different numbers of digits still order correctly; keys with equal
    # times fall back to the reverse string order used previously.
    return map  { $_->[1] }
           sort { $b->[0] <=> $a->[0] || $b->[1] cmp $a->[1] }
           map  { [ ($_ =~ /\A(\d+)_/ ? $1 : 0), $_ ] }
           keys %ITEMS;
}


sub create_rss {
    # Once we've populated %ITEMS, create the RSS file at $OUTPUT_FEED
    # containing the $ITEMS_FOR_FEED most recent items.

    my $rss = XML::RSS->new('version' => '1.0');

    $rss->channel(
        'title'       => "Site Name",
        'link'        => "http://www.yourdomain.com/",
        'description' => "Description of your site",
        'dc'          => {
            'date'      => UnixDate('Now', "%Y-%m-%dT%H:%M:%S") . timezone('Now'),
            'creator'   => 'your@emailaddress',
            'publisher' => 'your@emailaddress',
            'language'  => 'en-gb'   # Or 'en-us', etc.
        }
    );

    $rss->add_module(
        'prefix' => 'content',
        'uri'    => 'http://purl.org/rss/1.0/modules/content/'
    );

    $rss->image(
        'title' => 'Your name',
        'url'   => 'http://www.yourdomain.com/path/to/an/image.jpg',
        'link'  => 'http://www.yourdomain.com/'
    );

    my $count = 0;

    ITEM:
    foreach my $timesource (_newest_first_keys()) {
        last ITEM if $count == $ITEMS_FOR_FEED;

        my $entry = $ITEMS{$timesource};

        my %item = (
            'title'       => $entry->{'title'},
            'link'        => $entry->{'link'},
            'description' => $entry->{'description'},
            'dc'          => {
                'creator' => 'Your Name',
                'date'    => $entry->{'time'}
            }
        );

        if ($entry->{'content:encoded'}) {
            # NOTE(review): the reviewed copy assigned an empty string here,
            # throwing away the content it had just tested for.  The content
            # itself is stored now; if the original wrapped it (e.g. in a
            # CDATA section) restore that wrapper.
            $item{'content'}{'encoded'} = $entry->{'content:encoded'};
        }

        $rss->add_item(%item);

        $count++;
    }

    $rss->save($OUTPUT_FEED);

    return;
}


sub create_html {
    # Once we've populated %ITEMS, create the HTML file at $OUTPUT_HTML
    # containing the $ITEMS_FOR_HTML most recent items.
    # Dies if the file cannot be opened, written or closed.

    # This will contain the html we output.
    my $html = '';

    # These are so we can keep track of which day we're on.  They are only
    # needed by the disabled date-header code below.
    my $prev_itemdate;          # Date::Manip date of the previous item.
    my $prev_itemday = '';      # Will be yyyy-mm-dd.

    my $count = 0;

    ITEM:
    foreach my $timesource (_newest_first_keys()) {
        # Stop once we've added enough.
        last ITEM if $count == $ITEMS_FOR_HTML;

        my $entry = $ITEMS{$timesource};

        my $itemdate = ParseDate($entry->{'time'});
        my $itemday  = UnixDate($itemdate, "%Y-%m-%d");

        # $item_type will be like 'writing', 'flickr', 'links', etc.
        # So we can do different things for each type if needed.
        my ($unixtime, $item_type) = split /_/, $timesource;

        # Not currently used - we could output a date header here.
        # if ($itemday ne $prev_itemday) {
        #     # A new day, so create a new date header.
        #     if ($prev_itemday ne '') {
        #         $html .= "\t\t\n\t\n";
        #     }
        #
        #     $html .= "\t\n" . UnixDate($itemdate, "%e %B %Y") . "\n\n\t\n\n\t\t\n\n";
        # }

        # NOTE(review): the strings from here to the end of this sub contain
        # only whitespace around the item text; the HTML tags (and the use
        # of the item's link) appear to have been stripped - restore from
        # the original.
        $html .= "\t\t\n";
        $html .= $entry->{'title'} . ' Permalink';
        $html .= "\n\n\t\t\n";

        if ($entry->{'html_description'}) {
            # There's a special HTML version of the description.
            $html .= $entry->{'html_description'};
        }
        else {
            $html .= $entry->{'description'};
        }

        $html .= "\t\t\n\n";

        $prev_itemdate = $itemdate;
        $prev_itemday  = $itemday;

        $count++;
    }

    $html = "\t\n\n$html\n\n";

    open(my $html_fh, '>', $OUTPUT_HTML)
        or die "Can't open $OUTPUT_HTML: $!";
    print {$html_fh} $html
        or die "Can't write to $OUTPUT_HTML: $!";
    # Buffered write errors only show up when the handle is closed.
    close($html_fh)
        or die "Can't close $OUTPUT_HTML: $!";

    return;
}


####################################################
# Functions used by the main functions above.

sub get_xml {
    # Returns the XML text of the feed identified by $FEEDS{$type}.
    # Reads the local cache when $USE_LOCAL is 'true' or when the remote
    # fetch fails; a successful remote fetch is written to the cache.
    # Dies if $type is not a key of %FEEDS.
    my ($type) = @_;

    check_valid_type($type, 'get_xml');

    if ($USE_LOCAL eq 'true') {
        return get_local_feed($type);
    }

    my $feed_url = $FEEDS{$type};
    my $xml      = get($feed_url);

    if (defined $xml) {
        write_local_feed($type, $xml);
        return $xml;
    }

    print "Couldn't get $feed_url - using local cache.\n";
    return get_local_feed($type);
}


sub write_local_feed {
    # When we've got the remote feed we'll write a copy locally, to
    # $CACHE_DIR/$type.txt.  Dies if the file cannot be written.
    my ($type, $text) = @_;

    check_valid_type($type, 'write_local_feed');

    my $path = "$CACHE_DIR/$type.txt";

    open(my $feed_fh, '>', $path)
        or die "Couldn't open $path for writing: $!\n";
    print {$feed_fh} $text
        or die "Couldn't write to $path: $!\n";
    close($feed_fh)
        or die "Couldn't close $path after writing: $!\n";

    return;
}


sub get_local_feed {
    # Returns the contents of the cached copy of a feed,
    # $CACHE_DIR/$type.txt, as one string.  Dies if it cannot be opened.
    my ($type) = @_;

    check_valid_type($type, 'get_local_feed');

    my $path = "$CACHE_DIR/$type.txt";

    open(my $feed_fh, '<', $path)
        or die "Couldn't open $path for reading: $!\n";

    # Read whole contents into a string.  $/ is localised so that the
    # record separator is restored for the rest of the program.
    my $text = do { local $/; <$feed_fh> };

    close($feed_fh);

    return $text;
}


sub check_valid_type {
    # Checks that $type is a key of %FEEDS and dies if it is not.
    # $function is the caller's name, used in the error message.
    my ($type, $function) = @_;

    if (!exists $FEEDS{$type}) {
        die "'$type' is not a valid feed type in $function.\n";
    }

    return;
}


sub timezone {
    # Returns the timezone offset of $time formatted like '+01:00'.
    # UnixDate only does timezones like '+0100' and we need '+01:00'.
    # So we have to fix it manually here. Ugh.
    # Returns an empty string if no offset could be determined.
    my $time = shift(@_);

    my $timezone = UnixDate($time, "%z");
    return '' unless defined $timezone;

    $timezone =~ s/\A([+-]\d{2})(\d{2})/$1:$2/;

    return $timezone;
}