User talk:Davide Eynard

Hi Davide,

I'm a student at the University of British Columbia in Vancouver, Canada, and I've got a wiki running for a science group I volunteer with.

I've been trying to set up a Perl screen-scraping script for our news page so that I can generate a valid RSS feed, but my current script generates a feed with unwanted characters. Do you have any suggestions about how to improve it?

Thanks,

Kyle Hunter


#!/usr/bin/perl
# script for creating RSS feed from Wikipedia Recent Announcements page
# note, XML::RSS does not support RSS 0.92.

use strict; use diagnostics; use LWP::UserAgent; use HTTP::Date;

# When true, fetch the page over HTTP and save a local copy; when false,
# reuse the previously saved copy (saves hits during debugging).
use constant GETFILE => 1;

# Remote locations.
my $url_link_base     = "http://www.sciteam.ubc.ca/mw/index.php/SCI_Team:Current_News#";   # prefix used by anchor()
my $url_announcements = "http://www.sciteam.ubc.ca/mw/index.php?title=SCI_Team:05-06/Current_News&action=edit";
my $url_meta          = "http://meta.wikipedia.org/wiki/";   # prefix used by metalink()
my $url_wiki          = "http://en.wikipedia.org/wiki/";     # prefix used by wikilink()

# Local working files.
my $file_template = "template.xml";
my $file_output   = "output-new.txt";
my $file_rss      = "news-new.xml";
my $file_original = "original-new.htm";

# Whole text of the fetched (or previously saved) page.
my $in;

if (GETFILE) {
    # Create a user agent object (spoof Mozilla since it rejects libwww)
    my $ua = LWP::UserAgent->new;
    $ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030428 Mozilla Firebird/0.6 StumbleUpon/1.73");

    my $req = HTTP::Request->new(GET => $url_announcements);
    my $res = $ua->request($req);

    # Check the outcome of the response
    if ($res->is_success) {
        #print $res->content;
        $in = $res->content;

        # Keep a copy so later runs can work offline with GETFILE set to 0.
        open(my $save_fh, '>', $file_original)
            or die "Cannot write $file_original: $!\n";
        print {$save_fh} $in;
        close($save_fh)
            or die "Cannot close $file_original: $!\n";
    }
    else {
        print "Bad luck this time ", $res->code, "\n";
        exit;
    }
}
else {
    # we are just getting the local file.
    open(my $load_fh, '<', $file_original)
        or die "Cannot read $file_original: $!\n";
    # Slurp the whole file; the record separator is changed only for this read.
    $in = do { local $/; <$load_fh> };
    close($load_fh);
}

# Remove every opening and closing occurrence of one HTML tag from a string,
# leaving the text between the tags in place.
#   $temp - text to clean
#   $tag  - tag name without angle brackets (interpolated into the pattern,
#           so a pattern such as "h[1-6]" also works)
# Returns the cleaned copy; matching is case-insensitive.
sub remove_html_tag {
    my ($temp, $tag) = @_;
    # The lookahead requires the tag name to end right there, so asking for
    # "p" no longer deletes "<pre>" as well.
    $temp =~ s{<(?:$tag)(?=[\s/>])[^>]*>|</(?:$tag)\s*>}{}gi;
    return $temp;
}

# Build the URL that points at one section of the news page: the section
# name is passed through linkify() and appended to $url_link_base.
sub anchor {
    my ($temp) = @_;
    return $url_link_base . linkify($temp);
}

# Escape angle brackets so markup can be carried as text inside the feed.
#   $temp - string to escape
# Returns a copy with "<" replaced by "&lt;" and ">" by "&gt;".
# NOTE(review): "&" is deliberately left alone here because the input may
# already contain entities; confirm whether ampersands need escaping too.
sub escape {
    my ($temp) = @_;
    # An empty search pattern does not mean "match nothing" in Perl, so each
    # character has to be named explicitly.
    $temp =~ s/</&lt;/g;
    $temp =~ s/>/&gt;/g;
    return $temp;
}

# Turn a page or section name into its URL form by replacing every space
# with an underscore. Returns the converted copy.
sub linkify {
    my ($temp) = @_;
    (my $joined = $temp) =~ tr/ /_/;
    return $joined;
}

# Build an HTML link and return it in escaped form, ready to be placed
# inside the feed's description text.
#   $url  - link target
#   $name - visible link text
# Returns the anchor element after it has been passed through escape().
sub createlink {
    my ($url, $name) = @_;
    # The anchor markup has to be spelled out here; without it $url is
    # ignored and only the bare name comes back.
    my $temp = "<a href=\"$url\">$name</a>";
    #print "Created link: $temp\n";
    return escape($temp);
}

# Create a link to a page under $url_meta.
#   $url  - page name; spaces are converted by linkify()
#   $name - visible link text
# Returns whatever createlink() produces for the full address.
sub metalink {
    my ($url, $name) = @_;
    my $target = $url_meta . linkify($url);
    return createlink($target, $name);
}

# Create a link to a page under $url_wiki.
#   $url  - page name; spaces are converted by linkify()
#   $name - visible link text
# Returns whatever createlink() produces for the full address.
sub wikilink {
    my ($url, $name) = @_;
    # Debugging aids:
    #print "Calling wikilink on:\n";
    #print "URL: $url\n";
    #print "Name: $name\n\n";
    my $target = $url_wiki . linkify($url);
    return createlink($target, $name);
}

# Strip wiki link brackets from a heading so it can be used as a plain
# title. Returns the title with every "[[" and then every "]]" removed.
sub fixtitle {
    my ($title) = @_;
    # Two separate passes, opening brackets first, then closing ones.
    for my $marker ('[[', ']]') {
        $title =~ s/\Q$marker\E//g;
    }
    return $title;
}

# Convert a fragment of wiki markup into escaped HTML for the feed's
# description text.
#   $temp - wiki text of one news section
# Returns the converted text. Line-leading "*" / ":" become escaped breaks,
# '''bold''' becomes an escaped <strong> element, and wiki and external
# links are replaced by the output of metalink(), wikilink() and
# createlink().
sub wiki2html {
    my ($temp) = @_;

    # * is break
    $temp =~ s/\n[\*\:]/&lt;br&gt;&lt;br&gt;/gism;

    # deal with strong markups. The ''' delimiters are required: a bare
    # (.*?) matches the empty string at every position and fills the
    # output with empty <strong> elements.
    $temp =~ s/'''(.*?)'''/&lt;strong&gt;$1&lt;\/strong&gt;/gi;

    # links to Meta (do we need this?)
    #$temp =~ s/\[\[m:([^|]*?)]]/metalink($1, $2)/gisme;

    # links to Meta
    $temp =~ s/\[\[m:(.*?)\|(.*?)]]/metalink($1, $2)/gisme;

    # regular wiki links, i.e. [[Canada]]
    # | is a literal in square brackets.
    $temp =~ s/\[\[([^|]*?)]]/wikilink($1, $1)/gisme;

    # wiki links with a label, i.e. [[United States|USA]]
    $temp =~ s/\[\[(.*?)\|(.*?)]]/wikilink($1, $2)/gisme;

    # deal with external links
    $temp =~ s/\[(.*?) (.*?)]/createlink($1, $2);/gisme;

    # single form of external link
    $temp =~ s/\[(.*?)]/createlink($1, $1);/gisme;

    # remove the starting break if necessary. Anchored to the start of the
    # text so that breaks further in are kept.
    $temp =~ s/\A\s*&lt;br&gt;&lt;br&gt;//;

    return $temp;
}

# Keep only the raw wiki text, which the edit page carries inside a
# <textarea> element.
# NOTE(review): the textarea delimiters are reconstructed on the assumption
# that the page fetched with action=edit wraps the wiki text this way;
# confirm against a saved copy of the page.
if ($in =~ m|<textarea.*?>(.*?)</textarea>|ism) {
    $in = $1;
}
else {
    # Without a match $1 would be stale or undefined, so stop here.
    die "Could not find the wiki text in the fetched page\n";
}

# get rid of stuff: everything up to and including the "Current news"
# heading, and everything from the first trailing heading onwards.
$in =~ s|.*?==\s*?Current news.*?==||gism;
$in =~ s|==\s*?Number of article milestones\s*?==.*||gism;
$in =~ s|==\s*?Announcement Archives\s*?==.*||gism;
$in =~ s|==\s*?News Archives\s*?==.*||gism;

# store output (the trimmed wiki text, for inspection)
open(my $out_fh, '>', $file_output)
    or die "Cannot write $file_output: $!\n";
print {$out_fh} $in;
close($out_fh)
    or die "Cannot close $file_output: $!\n";

my @content;    # one { title, link, description } hash ref per news section
my @result;     # flat list of captures: heading, body, heading, body, ...

# Each match captures a "== heading ==" and the text after it, up to the
# next "==" or the end of the input. Accepting the end of the input keeps
# the final section, which has no heading following it.
@result = $in =~ m{==\s*(.*?)\s*==(.*?)(?===|\z)}gism;

for my $idx (0 .. $#result) {
    # Trim leading whitespace (a lazy \s*? here would match nothing).
    $result[$idx] =~ s/\A\s+//;
    $result[$idx] =~ s/\s*?$//gism;

    # EVEN entries are the headings (dates): do nothing more with them.
    next if $idx % 2 == 0;

    # ODD entries are the items under a heading.
    # trim initial *.
    $result[$idx] =~ s/^\s*?\*//g;
    my @items = split /\n\*+/, $result[$idx];
    for my $n (0 .. $#items) {
        print "Item $n: $items[$n]\n\n";
    }

    # Wrap each item in an escaped paragraph element.
    $result[$idx] = '&lt;p&gt;' . join('&lt;/p&gt;&lt;p&gt;', @items) . '&lt;/p&gt;';

    #print $result[$idx]. "\n\n";
}

# Debugging aid:
#for (0 .. $#result) { print "$_: $result[$_]\n"; }

# Pair every heading with the body that follows it.
for my $i (0 .. ($#result - 1) / 2) {
    #print "$i *$result[$i]*\n";

    # The link anchor is built from the cleaned title so that the "[[" and
    # "]]" wiki brackets do not end up in the URL.
    my $title = fixtitle($result[$i * 2]);
    my %temphash;
    $temphash{"title"}       = $title;
    $temphash{"link"}        = anchor($title);
    $temphash{"description"} = wiki2html($result[$i * 2 + 1]);
    push @content, \%temphash;
}

# Read the template one line at a time from here on.
$/ = "\n";

open(my $template_fh, '<', $file_template)
    or die "Cannot read $file_template: $!\n";
open(my $rss_fh, '>', $file_rss)
    or die "Cannot write $file_rss: $!\n";

# Copy the template to the feed file, expanding its two placeholders:
#   $content$        - the line is replaced by one <item> element per entry
#   $lastbuilddate$  - replaced in place by the current time from time2str()
while (my $line = <$template_fh>) {

    if ($line =~ /\$content\$/) {
        for my $key (0 .. $#content) {
            my $item = "<item>\n";
            $item .= "  <title>" . $content[$key]->{"title"} . "</title>\n";
            $item .= "  <description>" . $content[$key]->{"description"} . "</description>\n";
            $item .= "  <link>" . $content[$key]->{"link"} . "</link>\n";
            $item .= "</item>\n\n";
            print {$rss_fh} $item;
        }
        next;
    }

    if ($line =~ /\$lastbuilddate\$/) {
        my $temp = $line;
        $temp =~ s/\$lastbuilddate\$/time2str(time)/e;
        print {$rss_fh} $temp;
        next;
    }

    print {$rss_fh} $line;
}

close($template_fh);
# Buffered write errors only surface when the handle is closed.
close($rss_fh)
    or die "Cannot close $file_rss: $!\n";