1
0
Fork 0
mirror of https://github.com/perlbot/perlbuut synced 2025-06-07 10:35:41 -04:00
perlbuut/deps/IMDB/lib/.svn/text-base/IMDB.pm.svn-base
2009-12-05 00:02:04 -05:00

241 lines
4.8 KiB
Text

package IMDB;
use HTML::TreeBuilder;
use URI;
use LWP::Simple qw/get/;
use URI::Escape qw/uri_escape/;
use strict;
sub normalize_title
{
my( $self, $title ) = @_;
$title =~ tr/'"[]//d;
$title =~ s/ +/ /g;
return $title;
}
sub new
{
my( $class, $title ) = @_;
my $self = bless {}, $class;
my ($uri,$tree) = $self->search( $title );
return unless defined $tree;
$self->get_title($tree);
$self->get_basic_info($tree);
for( qw/plotsummary fullcredits trivia quotes/ )
{
warn "Fetching $uri$_\n";
my $html = get( "$uri$_" );
my $tree = HTML::TreeBuilder->new;
$tree->parse($html);
$tree->eof;
my $method_name = "get_$_";
$self->$method_name($tree);
}
return $self;
}
sub search
{
my( $self, $title ) = @_;
my $search_uri = "http://www.imdb.com/find?s=all&q=" . uri_escape($self->normalize_title($title));
warn "Fetching $search_uri\n";
my $html = get($search_uri);
my $tree = HTML::TreeBuilder->new;
$tree->parse($html);
$tree->eof;
if( not $tree->look_down(_tag => "title")->as_text =~ /IMDb.*Search/ )
{
my $link = $tree->look_down( _tag => 'a', href => qr#/title/tt# );
$link->attr('href') =~ m'(/title/tt\d+)';
return ( "http://www.imdb.com$1",$tree );
}
#This gets the initial header for the search results. <h2>Popular Results</h2>
my $top_ele = $tree->look_down( _tag => 'b', sub { $_[0]->as_text eq 'Popular Titles' } );
if( $top_ele and ($top_ele->parent->content_list)[2] )
{
$top_ele = ($top_ele->parent->content_list)[2]; #Should be the opening tag for the follow list of links.
}
else
{
$top_ele = $tree->look_down( _tag => 'b', sub { $_[0]->as_text eq 'Titles (Exact Matches)' } );
if( $top_ele )
{
$top_ele = ($top_ele->right)[1];
}
else
{
$top_ele = $tree->look_down( _tag => 'b', sub { $_[0]->as_text eq 'Titles (Approx Matches)' } );
if( $top_ele )
{
$top_ele = ($top_ele->right)[1];
}
else
{
$top_ele = $tree->look_down( _tag => 'b', sub { $_[0]->as_text eq 'Titles (Partial Matches)' } );
if( $top_ele )
{
$top_ele = ($top_ele->right)[1];
}
else
{
warn "Error, could not find a useful result for term $title\n";
return;
}
}
}
}
# warn "Top ele -- ", $top_ele->as_HTML;
my $first_link = $top_ele->look_down( _tag => 'a' ); #We only want the first link anyway.
my $path = URI->new($first_link->attr('href'))->path;
my $uri = URI->new_abs( $path, "http://imdb.com");
$self->{data}->{uri} = $uri;
my $actual_html = get( $uri );
my $new_tree = HTML::TreeBuilder->new;
$new_tree->parse($actual_html);
return ($uri,$new_tree);
}
sub get_title
{
my( $self, $tree ) = @_;
my $title = $tree->look_down(_tag => "title");
$self->{data}->{title} = $title->as_text if $title;
}
sub get_basic_info
{
my( $self, $tree ) = @_;
my $rdate = $tree->look_down( _tag => 'h5', sub { $_[0]->as_text eq 'Release Date:' } );
$self->{data}->{release_date} = $rdate->right if $rdate;
my $genre_title = $tree->look_down( _tag => 'h5', sub { $_[0]->as_text eq 'Genre:' } );
if( $genre_title )
{
my @genres = $genre_title->right;
pop @genres; #Remove the "more.." link.
if( @genres )
{
$self->{data}->{genre} .= (ref $_ ? $_->as_text : $_) for @genres;
}
}
}
sub get_plotsummary
{
my( $self, $tree ) = @_;
my $first_summary = $tree->look_down( _tag => 'p', class => 'plotpar' );
if( $first_summary )
{
$self->{data}->{summary} = $first_summary->as_text;
}
}
sub get_quotes
{
my( $self, $tree ) = @_;
my $first_link = $tree->look_down( _tag => "a", name => qr/qt\d+/ );
return unless $first_link;
my @quote_eles = ($first_link,$first_link->right);
my $quotes;
for( my $i = 0; $i < $#quote_eles; $i++ )
{
local $_ = $quote_eles[$i];
if( ref $_ and $_->tag eq 'a' and $_->attr('name') =~ /qt\d+/ )
{
my @quote;
my $start = $i;
for( $i; $i < @quote_eles; $i++ )
{
local $_ = $quote_eles[$i];
if( ref $_ and ( $_->tag eq 'hr' or $_->tag eq 'div' ) )
{
last;
}
if( ref $_ and $_->tag eq 'i' )
{
$quote[-1] .= $_->as_text;
$i++;
#Hrm, this should probably always be plain text..
$quote[-1] .= ref $quote_eles[$i] ? $quote_eles[$i]->as_text : $quote_eles[$i];
}
else
{
my $str = ref $_ ? $_->as_text : $_;
if( $str =~ /\S/ ) { push @quote, $str }
}
}
for( my $j = 0; $j < @quote; $j++ )
{
if( $quote[$j] =~ /:/ )
{
$quote[$j-1].=$quote[$j];
$quote[$j]='';
}
}
s/^\s+//,s/\s+$// for @quote;
@quote = grep length $_, @quote;
push @$quotes, \@quote;
}
}
$self->{data}->{quotes} = $quotes;
}
sub get_trivia
{
my( $self, $tree ) = @_;
my @trivia;
for my $ul ($tree->look_down( _tag => "ul", class => "trivia" ) )
{
for( $ul->look_down( _tag => "li" ) )
{
push @trivia, $_->as_text;
}
}
$self->{data}->{trivia} = \@trivia;
}
sub get_fullcredits
{
my( $self, $tree ) = @_;
}
1;