#!/usr/bin/perl -w
use strict;
$|=1;
use Dumpvalue;
use List::Util;
my $d= Dumpvalue->new();
my $svr= SneakServer->new("http://www.sneakpoint.nl");
my $sneakinfo= SneakList->new($svr);
my $sneakstats= SneakStats->new($sneakinfo->{cinemalist}, $sneakinfo->{sneaklist}, $sneakinfo->{newslist});
my $primarycinema= $sneakstats->{bycinema}{delf001};
$sneakstats->differencedump($primarycinema);
exit 0;
#############################################################################
#############################################################################
package SneakServer;
# this gets data from the sneakpoint webserver, or from the ".cache/" directory.
use HTTP::Request::Common qw(POST GET);
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Cookies;
sub new {
my ($class, $baseurl)= @_;
my $ua= LWP::UserAgent->new();
# $ua->agent("Mozilla/4.76 [en] (X11; U; Linux 2.4.9 i686)");
# $ua->cookie_jar(HTTP::Cookies->new());
# $ua->env_proxy();
return bless {
ua=>$ua,
baseurl=>$baseurl,
cache=>FileCache->new(),
}, $class;
}
sub getCachedNewsPage {
my ($self)= @_;
return $self->{cache}->getCachedFile("index.html", $self, \&getPage, "index.php");
}
sub getCachedCinemaListPage {
my ($self)= @_;
# changed 'plaatsen.php' to 'bioscopen.php'
return $self->{cache}->getCachedFile("plaatsen.html", $self, \&getPage, "bioscopen.php");
}
sub getCachedCinemaPage {
my ($self, $cinema, $year)= @_;
return $self->{cache}->getCachedFile("$cinema-$year.html", $self, \&getPage, "bioscoop.php?refer=plaatsen&biosid=$cinema&jaar=$year");
}
sub getPage {
my ($self, $path)= @_;
my $rq= GET "$self->{baseurl}/$path";
my $rp= $self->{ua}->request($rq) or die "error reading $path";
if (!$rp->is_success) {
die "error reading $self->{baseurl}/$path: ", $rp->status_line, "\n";
}
return $rp->content;
}
#############################################################################
#############################################################################
package SneakList;
# this gets the cinema list + the sneak list.
use HTML::TreeBuilder;
use Date::Manip;
#old:
@0.1.0.1.0.2.5.2.1.2
#new:
@0.1.2.0.0.3.0.0.0.2.2.1
use constant CINEMATABLEADDRESS =>"0.1.0.1.0.2.5";
use constant NEWSPAGEADDRESS => "0.1.0.1.0.2.19";
use constant NEWCINEMAPAGEADDRESS => "0.1.0.1.0.2.7";
use constant OLDCINEMAPAGEADDRESS => "0.1.0.1.0.2.7";
sub new {
my ($class, $htmlsvr)= @_;
Date_Init("Language=Dutch", "DateFormat=non-US");
my $self= bless {
htmlsvr=>$htmlsvr,
cache=>FileCache->new(),
}, $class;
$self->{cinemalist}= $self->getCachedCinemaList();
$self->{sneaklist}= $self->getCachedSneakList();
$self->{newslist}= $self->getCachedNewsList();
return $self;
}
sub getCachedNewsList {
my ($self)= @_;
return $self->{cache}->getCachedCsvFile("news.csv", $self, \&getNewsList);
}
sub getNewsList {
my ($self)= @_;
return $self->parseNewsPage($self->{htmlsvr}->getCachedNewsPage());
}
sub parseNewsPage {
my ($self, $html)= @_;
my @newslist;
my $h = HTML::TreeBuilder->new();
$h->parse($html);
$h->eof();
# tree-builder address: 0.1.$letter.1.$country.0
my $currentdate= undef;
if (!$h->address(NEWSPAGEADDRESS)) {
die "html has changed in news page\n";
}
for my $tr ($h->address(NEWSPAGEADDRESS)->content_list()) {
my @cells= $tr->content_list();
if (@cells==1) {
$currentdate= ParseDate($cells[0]->address(".0.0"));
print "DATEERROR" unless (defined $currentdate);
}
elsif (@cells==2) {
my $href= $cells[0]->address(".0")->{href};
my ($cinemaid)= ($href =~ /biosid=(\w+)/);
my $movie= $cells[1]->address(".0");
if (ref $movie eq "HTML::Element") {
next; # onbekend
}
my $sneak= {date=>$currentdate, movie=>$movie, cinema=>$cinemaid,
_uxdate=>int((UnixDate($currentdate, "%s")+8*3600)/86400)};
push @newslist, $sneak;
}
else {
die "could not parse index page\n";
}
}
return \@newslist;
}
#############################################################################
sub getCachedCinemaList {
my ($self)= @_;
return $self->{cache}->getCachedCsvFile("cinemalist.csv", $self, \&getCinemaList);
}
sub getCinemaList {
my ($self)= @_;
return $self->parseCinemaList($self->{htmlsvr}->getCachedCinemaListPage());
}
# parses html containing list of cities + cinemas, retuns array of cinemas
sub parseCinemaList {
my ($self, $html)= @_;
my @cinemas;
my $h = HTML::TreeBuilder->new();
$h->parse($html);
$h->eof();
# tree-builder address: 0.1.$letter.1.$country.0
my $currentcity= undef;
if (!$h->address(CINEMATABLEADDRESS)) {
die "html has changed in cinema list\n";
}
for my $tr ($h->address(CINEMATABLEADDRESS)->content_list()) {
my @cells= $tr->content_list();
next if (@cells!=2);
$currentcity = getCityFromCell($cells[0]) || $currentcity;
my ($cinemaid, $cinemaname)= getCinemaInfoFromCell($cells[1]);
push @cinemas, { id=>$cinemaid, name=>$cinemaname, city=>$currentcity };
}
return \@cinemas;
}
sub getCityFromCell {
my $td= shift;
my $content= $td->address(".0"); # ($td->content_list())[0];
if (ref \$content eq "SCALAR") {
$content =~ s/\xa0*$//;
return $content;
}
}
# returns cinema id + cinema name
sub getCinemaInfoFromCell {
my $td= shift;
my $href= $td->address(".1"); # ($td->content_list())[1];
my $url= $href->{href};
my $name= $href->address(".0"); #($href->content_list())[0];
my ($id)= ($url =~ /biosid=(\w+)/) or die "invalid url : $url\n";
return ($id, $name);
}
#############################################################################
sub getCachedSneakList {
my ($self)= @_;
return $self->{cache}->getCachedCsvFile("sneaklist.csv", $self, \&getSneakList);
}
sub getSneakList {
my ($self)= @_;
my @list;
# get sneak list for each cinema.
for my $cinema (@{$self->{cinemalist}}) {
print "checking $cinema->{id}\n";
my $sneaklist= $self->getCachedSneakListForCinema($cinema->{id});
push @list, @{$sneaklist} if ($sneaklist);
}
return \@list;
}
#############################################################################
sub getCachedSneakListForCinema {
my ($self, $cinemaid)= @_;
return $self->{cache}->getCachedCsvFile("sneaklist-$cinemaid.csv", $self, \&getSneakListForCinema, $cinemaid);
}
sub getSneakListForCinema {
my ($self, $cinemaid)= @_;
my @list;
appendToList(\@list, $self->parseCinema($cinemaid, 2002, $self->{htmlsvr}->getCachedCinemaPage($cinemaid, 2002)));
appendToList(\@list, $self->parseCinema($cinemaid, 2001, $self->{htmlsvr}->getCachedCinemaPage($cinemaid, 2001)));
appendToList(\@list, $self->parseCinema($cinemaid, 2000, $self->{htmlsvr}->getCachedCinemaPage($cinemaid, 2000)));
return \@list;
}
sub appendToList {
my ($list1, $list2)= @_;
push @$list1, @$list2 if ($list2 && @$list2);
}
# method parses html page for cinema, returns list of sneaks.
sub parseCinema {
my ($self, $cinemaid, $year, $html)= @_;
my $h = HTML::TreeBuilder->new();
$h->parse($html);
$h->eof();
# determine year of this page.
# my $year;
# return undef unless ($h->address("0.1.0.1.0.2.6.1.0.0"));
# for my $yeartag ($h->address("0.1.0.1.0.2.6.1.0.0")->content_list()) {
# if (ref $yeartag && $yeartag->tag() eq "b")
# {
# $year= $yeartag->address(".0");
# last;
# }
# }
# if (!defined $year) { print "could not determine year for file $cinemaid\n"; }
if ($year == 2000) {
return $self->parseCinemaOld($cinemaid, $year, $h);
}
else {
return $self->parseCinemaNew($cinemaid, $year, $h);
}
}
sub parseCinemaNew {
my ($self, $cinemaid, $year, $h)= @_;
my @sneaks;
if (!$h->address(NEWCINEMAPAGEADDRESS)) {
die "cinema format has changed\n";
}
# extract list of dates+movies
return undef unless (ref $h->address(NEWCINEMAPAGEADDRESS) eq "HTML::Element");
for my $tr_twomonths ($h->address(NEWCINEMAPAGEADDRESS)->content_list()) {
for my $td_month ($tr_twomonths->content_list()) {
last unless (ref $td_month->address(".0") eq "HTML::Element");
for my $td_date ($td_month->address(".0")->content_list()) {
my @cells= $td_date->content_list();
next if (@cells!=2);
my $sneak= $self->getSneak($cinemaid, $year, $cells[0], $cells[1]);
push @sneaks, $sneak if ($sneak);
}
}
}
return \@sneaks;
}
# parse year 2000 style pages.
sub parseCinemaOld {
my ($self, $cinemaid, $year, $h)= @_;
my @sneaks;
if (!$h->address(OLDCINEMAPAGEADDRESS)) {
die "cinema format has changed\n";
}
# extract list of dates+movies
return undef unless (ref $h->address(OLDCINEMAPAGEADDRESS) eq "HTML::Element");
for my $tr_twomonths ($h->address(OLDCINEMAPAGEADDRESS)->content_list()) {
my @cells= $tr_twomonths->content_list();
next if (@cells!=4); # month header line.
my $sneak1= $self->getSneak($cinemaid, $year, $cells[0], $cells[1]);
push @sneaks, $sneak1 if ($sneak1);
my $sneak2= $self->getSneak($cinemaid, $year, $cells[2], $cells[3]);
push @sneaks, $sneak2 if ($sneak2);
}
return \@sneaks;
}
sub getSneak {
my ($self, $cinemaid, $year, $dateelem, $movieelem)= @_;
my $movie= $movieelem->address(".0");
$movie =~ s/^\xa0*//;
my $datestring= $dateelem->address(".0");
my $date= convertDate($year, $datestring);
if (!$date) {
print "skipping $movie for $cinemaid - no date\n";
return;
}
if ($movie eq "Geen Sneak") {
print "skipping $datestring for $cinemaid - no sneak\n";
return;
}
return {date=>$date, movie=>$movie, cinema=>$cinemaid,
_uxdate=>int((UnixDate($date, "%s")+8*3600)/86400)};
}
# convert dutch date to days since 1-1-1970
sub convertDate {
my ($year, $date)= @_;
if ($date !~ $year) { $date .= " $year"; }
return ParseDate($date);
}
#############################################################################
#############################################################################
package SneakStats;
sub new {
my ($class, $cinemalist, $sneaklist, $newslist)= @_;
my $self= bless {
cinemalist=>$cinemalist,
sneaklist=>$sneaklist,
newslist=>$newslist,
}, $class;
# calculate index for cinema-id => cinema-info
for my $cinema (@{$self->{cinemalist}}) {
$self->{bycinema}{$cinema->{id}}= $cinema;
}
# calculate per cinema index by date, by movie.
for my $sneak (@{$self->{sneaklist}}) {
my $cinema= $self->{bycinema}{$sneak->{cinema}};
$cinema->{bydate}{$sneak->{date}}= $sneak;
$cinema->{bymovie}{$sneak->{movie}}= $sneak;
push @{$self->{bydate}{$sneak->{date}}}, $sneak;
push @{$self->{bymovie}{$sneak->{movie}}}, $sneak;
}
# print "error in $cinema->{id}" if (keys %{$cinema->{bymovie}} != keys %{$cinema->{bydate}});
for my $sneak (@{$self->{newslist}}) {
next if ($sneak->{cinema});
my $cinema= $self->{bycinema}{$sneak->{cinema}};
$cinema->{bydate}{$sneak->{date}}= $sneak;
$cinema->{bymovie}{$sneak->{movie}}= $sneak;
push @{$self->{bydate}{$sneak->{date}}}, $sneak;
push @{$self->{bymovie}{$sneak->{movie}}}, $sneak;
}
return $self;
}
sub difference {
my ($self, $cinemaname1, $cinemaname2)= @_;
my ($missing, $dist0, $dist1, $dist2)= (0,0,0,0);
my ($cinema1, $cinema2)= map { $self->{bycinema}{$_} } ($cinemaname1, $cinemaname2);
for my $sneak1 (@{$cinema1->{sneaks}}) {
my $sneak2= $cinema2->{bymovie}{$sneak1->{movie}};
if (!$sneak2) {
$missing++;
next;
}
my $diff= $sneak1->{_uxdate} - $sneak2->{_uxdate};
$dist0++;
$dist1+= abs($diff);
$dist2+= $diff**2;
}
if ($dist0) {
return ($dist1/$dist0, sqrt($dist2)/$dist0, $missing);
}
else {
return (undef, undef, $missing);
}
}
sub differencedump {
my ($self, $primarycinema)= @_;
# calculate differences
for my $cinema (@{$self->{cinemalist}}) {
for my $sneakdate (sort keys %{$primarycinema->{bydate}}) {
my $primarysneak= $primarycinema->{bydate}{$sneakdate};
my $sneak= $cinema->{bymovie}{$primarysneak->{movie}};
if ($sneak) {
my $diff= $sneak->{_uxdate} - $primarysneak->{_uxdate};
$cinema->{diffs}{$diff}++;
}
else {
$cinema->{misses}++;
}
}
}
# print results
for my $cinemaid (sort {maxdiff($b)<=>maxdiff($a) || misses($a)<=>misses($b)} keys %{$self->{bycinema}}) {
my $cinema= $self->{bycinema}{$cinemaid};
printf("%-8s", $cinema->{id});
for my $sneakdate (sort keys %{$primarycinema->{bydate}}) {
my $primarysneak= $primarycinema->{bydate}{$sneakdate};
my $sneak= $cinema->{bymovie}{$primarysneak->{movie}};
if ($sneak) {
my $diff= $sneak->{_uxdate} - $primarysneak->{_uxdate};
printf(" %3d", $diff);
}
else {
printf(" -");
}
}
my $diffs= $cinema->{diffs};
print " * ", join ", ", map { sprintf("%3s:%2d", $_, $diffs->{$_}) } sort {$diffs->{$b}<=>$diffs->{$a}} keys %$diffs;
print "\n";
}
}
sub maxdiff {
my $id= shift;
my $max= List::Util::max(values %{$sneakstats->{bycinema}{$id}{diffs}});
return 0 if (!defined $max);
return $max;
}
sub misses {
my $id= shift;
my $misses= $sneakstats->{bycinema}{$id}{misses};
return 0 if (!defined $misses);
return $misses;
}
#############################################################################
#############################################################################
package FileCache;
use constant CACHENAME=> ".cache";
sub new {
my ($class)= @_;
if (! -e CACHENAME) {
mkdir CACHENAME;
}
if (! -d CACHENAME) {
die "cache ", CACHENAME, " should be a directory\n";
}
return bless {}, $class;
}
sub isInCache {
my ($self, $filename)= @_;
return -e CACHENAME."/$filename";
}
sub readFromCache {
my ($self, $filename)= @_;
my $fh= IO::File->new();
$fh->open(CACHENAME."/$filename", "<") or die "error locating $filename in cache : $!";
my $buf;
$fh->read($buf, -s CACHENAME."/$filename") or die "error reading from file: $!\n";
$fh->close();
print "loaded $filename from cache\n";
return $buf;
}
sub writeToCache {
my ($self, $filename, $contents)= @_;
my $fh= IO::File->new();
$fh->open(CACHENAME."/$filename", ">") or die "error creating $filename in cache : $!";
$fh->write($contents) or die "error writing to file: $!\n";
$fh->close();
print "wrote ", length($contents), " bytes to $filename\n";
}
sub writeCsvToCache {
my ($self, $filename, $list)= @_;
return if (!$list);
return if (!@$list);
return $self->writeToCache($filename, $self->convertToCsv($list));
}
sub readCsvFromCache {
my ($self, $filename)= @_;
return $self->parseCsvFile($self->readFromCache($filename));
}
sub parseCsvFile {
my ($self, $contents)= @_;
my @lines= split /\n/, $contents;
my @keys= $self->parseKeysLine($lines[0]) or die "no keys line\n";
my @data;
for(my $i=1 ; $i<=$#lines ; $i++) {
my @values= $self->parseCsvLine($lines[$i]) or die "invalid line $lines[$i]\n";
die "incorrect line $i $lines[$i]\n" if ($#values != $#keys);
push @data, { map { $keys[$_]=>$values[$_] } (0..$#keys) };
}
return \@data;
}
# keys line start with comment char ("#"),
# followed by comma separated field names.
sub parseKeysLine {
my ($self, $line)= @_;
return if (! ($line =~ s/^#\s*//));
return split /,\s*/, $line;
}
# csv line are double-quoted strings, separated by commas (",")
# double quotes are escaped with backslash.
sub parseCsvLine {
my ($self, $line)= @_;
chomp $line;
my @values= map { s/\\(.)/$1/g; $_ } split /",\s*"/, $line;
$values[0] =~ s/^\s*"//;
$values[-1] =~ s/"\s*$//;
return @values;
}
sub convertToCsv {
my ($self, $list)= @_;
my @lines;
return if (!$list);
return if (!@$list);
my @keys= keys %{$list->[0]};
push @lines, $self->convertKeysToCsv(@keys);
for my $item (@$list) {
push @lines, $self->convertValuesToCsv(map {$item->{$_}} @keys);
}
return join "\n", @lines;
}
sub convertKeysToCsv {
my ($self, @keys)= @_;
return "#".join(",",@keys);
}
sub convertValuesToCsv {
my ($self, @values)= @_;
return "\"".join("\",\"", map { $_ && s/(["\\])/\\$1/g; $_ || "" } @values)."\"";
}
sub getCachedCsvFile {
my ($self, $cachename, $object, $method, @params)= @_;
if ($self->isInCache($cachename)) {
return $self->readCsvFromCache($cachename);
}
my $list= $object->$method(@params);
$self->writeCsvToCache($cachename, $list);
return $list;
}
sub getCachedFile {
my ($self, $cachename, $object, $method, @params)= @_;
if ($self->isInCache($cachename)) {
return $self->readFromCache($cachename);
}
my $data= $object->$method(@params);
$self->writeToCache($cachename, $data);
return $data;
}