Capable of scraping all of an authors stuff

This commit is contained in:
Ryan Voots 2015-12-05 19:12:52 -08:00
parent fad7ac8e68
commit 1fdefd9b07
5 changed files with 174 additions and 30 deletions

View file

@ -5,8 +5,13 @@ use warnings;
use lib './lib'; use lib './lib';
use Data::Dumper;
use Shapeways::Account;
use Shapeways::Item; use Shapeways::Item;
my $item = Shapeways::Item->new_from_url('http://www.shapeways.com/product/TCE9QM4VD/succubus?li=user-profile'); my $item = Shapeways::Item->new_from_url('https://www.shapeways.com/product/TCE9QM4VD/succubus?li=user-profile');
$item->serialize();
#my $account = Shapeways::Account->new(account_name => 'mz4250');
$item->serialize; #$account->serialize;

View file

@ -7,6 +7,7 @@ use HTML::TreeBuilder::XPath;
use Data::Dumper; use Data::Dumper;
use JSON::MaybeXS; use JSON::MaybeXS;
use Shapeways::Logger;
use Shapeways::Item; use Shapeways::Item;
class_has mech => ( class_has mech => (
@ -30,13 +31,16 @@ sub get_url {
my $self = shift; my $self = shift;
my ($url) = @_; my ($url) = @_;
log_debug {"Fetching: ", $url};
my $res = $self->mech->get($url); my $res = $self->mech->get($url);
if ($res->is_success) { if ($res->is_success) {
my $tree_path = HTML::TreeBuilder::XPath->new_from_content($res->content); my $tree_path = HTML::TreeBuilder::XPath->new_from_content($res->content);
log_debug {"Fetch success"};
return ($tree_path, 200); return ($tree_path, 200);
} else { } else {
log_error {"Fetch failure => ", $res->code, " ", $res->status_line};
return (undef, $res->code); return (undef, $res->code);
} }
} }
@ -45,20 +49,17 @@ sub _build_mech {
my $self = shift; my $self = shift;
my $mech = WWW::Mechanize::GZip->new(); my $mech = WWW::Mechanize::GZip->new();
log_info {"Logging in"};
my $login = $mech->post('https://www.shapeways.com/login/json-shapeways', $self->config); # TODO better config? my $login = $mech->post('https://www.shapeways.com/login/json-shapeways', $self->config); # TODO better config?
my $s = decode_json $login->content; my $s = decode_json $login->content;
if ($s->{success}) { if ($s->{success}) {
log_debug {"Logged in succesfully"};
return $mech return $mech
} else { } else {
log_debug {"Failed to log in: ", $login->content};
die "login failed"; die "login failed";
} }
} }
sub fetch_user {
}
sub fetch_list { # Do these exist? maybe this is a cart?
}
1; 1;

View file

@ -1,17 +1,100 @@
package Shapeways::Account; package Shapeways::Account;
use Moose; use Moose;
use MooseX::Storage;
use Shapeways; use Shapeways;
use Shapeways::Logger;
use List::MoreUtils qw/uniq/;
use Data::Dumper;
with Storage(format => 'JSON');
has account_name => ( has account_name => (
is => 'ro',
); );
has account_id => ( has creations => (
);
has items => (
is => 'ro', is => 'ro',
lazy => 1, lazy => 1,
default => sub {...} builder => '_get_creations',
); );
has favorites => (
is => 'ro',
lazy => 1,
builder => '_get_favorites',
);
# TODO this will make a list of their lists that contains items. probably wants a new class
has lists => (
is => 'ro',
lazy => 1,
builder => '_get_lists',
);
sub _parse_page {
my $self = shift;
my $tree = shift;
# Gotta uniq this since the link is duplicated for text and pictures. easier to just do this than futz with the xpath
my @items = uniq $tree->findvalues('//a[@class="product-url"]/@href');
my $next_page = ($tree->findvalues('//a[span[contains(@class,"results-page-next")]]/@href'))[0];
return {
items => \@items,
next_page => $next_page,
};
}
sub _get_creations {
my $self = shift;
# first page starts here
my $next_page = 'http://www.shapeways.com/designer/'. $self->account_name;
my @item_urls;
# Loop until we failed to find another page
while ($next_page) {
my ($tree_path, $code) = Shapeways->get_url($next_page);
my $data = $self->_parse_page($tree_path);
push @item_urls, @{$data->{items}};
$next_page = $data->{next_page};
}
my @items = map {Shapeways::Item->new_from_url($_)} @item_urls;
return \@items;
}
sub _get_lists {return undef};
sub _get_favorites {return undef};
sub serialize {
my $self = shift;
my %opts = (
mkdir => 1, # make a directory for the files
dirname => $self->account_name,
path => './',
@_
);
# Download the file, rename it via the title, save a corrosponding .txt file with the description, title and original file name
log_info {"Making author dir: ", $self->account_name};
my $dir = $opts{path}.'/'.$opts{dirname}.'/';
log_debug {"dir => ", $dir};
mkdir($dir) or die "Couldn't mkdir $dir => $!";
# TODO save more than the first item
for my $item ($self->creations->[0]) {
# Serialize under the author directory
$item->serialize(path => $dir);
}
}
1;

View file

@ -4,6 +4,8 @@ use Moose;
use MooseX::Storage; use MooseX::Storage;
use Shapeways; use Shapeways;
use Shapeways::Logger;
with Storage(format => 'JSON'); with Storage(format => 'JSON');
has contents => ( has contents => (
@ -12,6 +14,16 @@ has contents => (
builder => '_download_item', builder => '_download_item',
); );
has image => (
is => 'ro',
lazy => 1,
builder => '_download_image',
);
has image_url => (
is => 'ro',
);
has download_url => ( has download_url => (
is => 'ro', is => 'ro',
default => undef default => undef
@ -34,18 +46,37 @@ has file_ext => (
default => '.zip', # will this ever be different? .stl maybe? default => '.zip', # will this ever be different? .stl maybe?
); );
sub _download_image {
my $self = shift;
log_info {"Downloading image"};
my $mech = Shapeways->mech;
my $res = $mech->get($self->image_url);
if ($res->is_success) {
# TODO check headers for extension
log_info {"Download successful"};
return $res->content;
} else {
log_error {"Failed to download: ", $res->code, " ", $res->status_line};
return undef;
}
}
sub _download_item { sub _download_item {
my $self = shift; my $self = shift;
my $mech = Shapeways->mech(); my $mech = Shapeways->mech();
my $base = Shapeways->base_url(); my $base = Shapeways->base_url();
log_info {"Downloading ", $self->title, "(", $self->shapeways_id, ") ", $self->download_url};
my $res = $mech->get($base . $self->download_url); my $res = $mech->get($base . $self->download_url);
if ($res->is_success) { if ($res->is_success) {
# TODO check headers for extension # TODO check headers for extension
log_info {"Download successful"};
return $res->content; return $res->content;
} else { } else {
warn "Failed to fetch: " . $self->freeze(); log_error {"Failed to download: ", $res->code, " ", $res->status_line};
return undef; return undef;
} }
} }
@ -62,20 +93,31 @@ sub serialize {
); );
# Download the file, rename it via the title, save a corrosponding .txt file with the description, title and original file name # Download the file, rename it via the title, save a corrosponding .txt file with the description, title and original file name
my $dir = $opts{path}.'/'.$opts{dirname}.'/'; log_info {"Saving to disk: ", $self->title, "(", $self->shapeways_id, ") ", $self->download_url};
my $dir = $opts{path}.'/'.$opts{dirname}.'/';
log_debug {"dir => ", $dir};
mkdir($dir) or die "Couldn't mkdir $dir => $!";
mkdir($dir) or die "Couldn't mkdir $dir => $!"; log_debug {"Saving main file"};
open(my $fh, '>', $dir.$opts{filebase}.$self->file_ext) or die "Couldn't write base file => $!";
print $fh $self->contents;
close($fh);
open(my $fh, '>', $dir.$opts{filebase}.$self->file_ext) or die "Couldn't write base file => $!"; log_debug {"Saving text file"};
print $fh $self->contents; open(my $descfh, '>', $dir.$opts{filebase}.'.txt') or die "Coouldn't open desc file => $!";
close($fh); my $header = $self->title . " - " . $self->shapeways_id;
print $descfh $header, "\n";
open(my $descfh, '>', $dir.$opts{filebase}.'.txt') or die "Coouldn't open desc file => $!"; print $descfh "="x(length($header)), "\n\n";
my $header = $self->title . " - " . $self->shapeways_id; print $descfh $self->description; # TODO html decoding? not sure
print $descfh $header, "\n"; close($descfh);
print $descfh "="x(length($header)), "\n\n";
print $descfh $self->description; # TODO html decoding? not sure log_debug {"Saving image file"};
close($descfh); open(my $imgfh, '>', $dir."folder.jpg") or die "Couldn't save image => $!";
print $imgfh $self->image;
close($imgfh);
# TODO fetch images
} }
sub new_from_url { sub new_from_url {
@ -89,6 +131,7 @@ sub new_from_url {
description => $tree_path->findvalue('//div[contains(@class,"product-description-content")]'), description => $tree_path->findvalue('//div[contains(@class,"product-description-content")]'),
title => $tree_path->findvalue('//h1[contains(@class,"product-title-header")]'), title => $tree_path->findvalue('//h1[contains(@class,"product-title-header")]'),
download_url => $tree_path->findvalue('//div[contains(@class,"product-page-download")]//a/@href'), download_url => $tree_path->findvalue('//div[contains(@class,"product-page-download")]//a/@href'),
image_url => ($tree_path->findvalues('//img[contains(@class, "film-strip-img")]/@src'))[0], # only get the first image
}; };
my $obj = Shapeways::Item->new($data); my $obj = Shapeways::Item->new($data);

12
lib/Shapeways/Logger.pm Normal file
View file

@ -0,0 +1,12 @@
package Shapeways::Logger;
use base 'Exporter';
use Log::Contextual::SimpleLogger;
use Log::Contextual qw( :log :dlog ),
-logger => Log::Contextual::SimpleLogger->new({ levels_upto => $ENV{LOG_LEVEL}||'info' });
my @levels = qw/trace info error debug warn fatal/;
our @EXPORT=map({"log_$_"} @levels), map({"Dlog_$_"} @levels);
1;