Capable of scraping all of an authors stuff
This commit is contained in:
parent
fad7ac8e68
commit
1fdefd9b07
5 changed files with 174 additions and 30 deletions
|
@ -5,8 +5,13 @@ use warnings;
|
||||||
|
|
||||||
use lib './lib';
|
use lib './lib';
|
||||||
|
|
||||||
|
use Data::Dumper;
|
||||||
|
|
||||||
|
use Shapeways::Account;
|
||||||
use Shapeways::Item;
|
use Shapeways::Item;
|
||||||
|
|
||||||
my $item = Shapeways::Item->new_from_url('http://www.shapeways.com/product/TCE9QM4VD/succubus?li=user-profile');
|
my $item = Shapeways::Item->new_from_url('https://www.shapeways.com/product/TCE9QM4VD/succubus?li=user-profile');
|
||||||
|
$item->serialize();
|
||||||
|
#my $account = Shapeways::Account->new(account_name => 'mz4250');
|
||||||
|
|
||||||
$item->serialize;
|
#$account->serialize;
|
|
@ -7,6 +7,7 @@ use HTML::TreeBuilder::XPath;
|
||||||
use Data::Dumper;
|
use Data::Dumper;
|
||||||
use JSON::MaybeXS;
|
use JSON::MaybeXS;
|
||||||
|
|
||||||
|
use Shapeways::Logger;
|
||||||
use Shapeways::Item;
|
use Shapeways::Item;
|
||||||
|
|
||||||
class_has mech => (
|
class_has mech => (
|
||||||
|
@ -30,13 +31,16 @@ sub get_url {
|
||||||
my $self = shift;
|
my $self = shift;
|
||||||
my ($url) = @_;
|
my ($url) = @_;
|
||||||
|
|
||||||
|
log_debug {"Fetching: ", $url};
|
||||||
my $res = $self->mech->get($url);
|
my $res = $self->mech->get($url);
|
||||||
|
|
||||||
if ($res->is_success) {
|
if ($res->is_success) {
|
||||||
my $tree_path = HTML::TreeBuilder::XPath->new_from_content($res->content);
|
my $tree_path = HTML::TreeBuilder::XPath->new_from_content($res->content);
|
||||||
|
log_debug {"Fetch success"};
|
||||||
|
|
||||||
return ($tree_path, 200);
|
return ($tree_path, 200);
|
||||||
} else {
|
} else {
|
||||||
|
log_error {"Fetch failure => ", $res->code, " ", $res->status_line};
|
||||||
return (undef, $res->code);
|
return (undef, $res->code);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -45,20 +49,17 @@ sub _build_mech {
|
||||||
my $self = shift;
|
my $self = shift;
|
||||||
my $mech = WWW::Mechanize::GZip->new();
|
my $mech = WWW::Mechanize::GZip->new();
|
||||||
|
|
||||||
|
log_info {"Logging in"};
|
||||||
my $login = $mech->post('https://www.shapeways.com/login/json-shapeways', $self->config); # TODO better config?
|
my $login = $mech->post('https://www.shapeways.com/login/json-shapeways', $self->config); # TODO better config?
|
||||||
my $s = decode_json $login->content;
|
my $s = decode_json $login->content;
|
||||||
|
|
||||||
if ($s->{success}) {
|
if ($s->{success}) {
|
||||||
|
log_debug {"Logged in succesfully"};
|
||||||
return $mech
|
return $mech
|
||||||
} else {
|
} else {
|
||||||
|
log_debug {"Failed to log in: ", $login->content};
|
||||||
die "login failed";
|
die "login failed";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sub fetch_user {
|
|
||||||
}
|
|
||||||
|
|
||||||
sub fetch_list { # Do these exist? maybe this is a cart?
|
|
||||||
}
|
|
||||||
|
|
||||||
1;
|
1;
|
||||||
|
|
|
@ -1,17 +1,100 @@
|
||||||
package Shapeways::Account;
|
package Shapeways::Account;
|
||||||
|
|
||||||
use Moose;
|
use Moose;
|
||||||
|
use MooseX::Storage;
|
||||||
use Shapeways;
|
use Shapeways;
|
||||||
|
|
||||||
|
use Shapeways::Logger;
|
||||||
|
|
||||||
|
use List::MoreUtils qw/uniq/;
|
||||||
|
|
||||||
|
use Data::Dumper;
|
||||||
|
|
||||||
|
with Storage(format => 'JSON');
|
||||||
|
|
||||||
has account_name => (
|
has account_name => (
|
||||||
|
is => 'ro',
|
||||||
);
|
);
|
||||||
|
|
||||||
has account_id => (
|
has creations => (
|
||||||
);
|
|
||||||
|
|
||||||
has items => (
|
|
||||||
is => 'ro',
|
is => 'ro',
|
||||||
lazy => 1,
|
lazy => 1,
|
||||||
default => sub {...}
|
builder => '_get_creations',
|
||||||
);
|
);
|
||||||
|
|
||||||
|
has favorites => (
|
||||||
|
is => 'ro',
|
||||||
|
lazy => 1,
|
||||||
|
builder => '_get_favorites',
|
||||||
|
);
|
||||||
|
|
||||||
|
# TODO this will make a list of their lists that contains items. probably wants a new class
|
||||||
|
has lists => (
|
||||||
|
is => 'ro',
|
||||||
|
lazy => 1,
|
||||||
|
builder => '_get_lists',
|
||||||
|
);
|
||||||
|
|
||||||
|
sub _parse_page {
|
||||||
|
my $self = shift;
|
||||||
|
my $tree = shift;
|
||||||
|
|
||||||
|
# Gotta uniq this since the link is duplicated for text and pictures. easier to just do this than futz with the xpath
|
||||||
|
my @items = uniq $tree->findvalues('//a[@class="product-url"]/@href');
|
||||||
|
my $next_page = ($tree->findvalues('//a[span[contains(@class,"results-page-next")]]/@href'))[0];
|
||||||
|
|
||||||
|
return {
|
||||||
|
items => \@items,
|
||||||
|
next_page => $next_page,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
sub _get_creations {
|
||||||
|
my $self = shift;
|
||||||
|
|
||||||
|
# first page starts here
|
||||||
|
my $next_page = 'http://www.shapeways.com/designer/'. $self->account_name;
|
||||||
|
my @item_urls;
|
||||||
|
|
||||||
|
# Loop until we failed to find another page
|
||||||
|
while ($next_page) {
|
||||||
|
my ($tree_path, $code) = Shapeways->get_url($next_page);
|
||||||
|
my $data = $self->_parse_page($tree_path);
|
||||||
|
|
||||||
|
push @item_urls, @{$data->{items}};
|
||||||
|
$next_page = $data->{next_page};
|
||||||
|
}
|
||||||
|
|
||||||
|
my @items = map {Shapeways::Item->new_from_url($_)} @item_urls;
|
||||||
|
|
||||||
|
return \@items;
|
||||||
|
}
|
||||||
|
|
||||||
|
sub _get_lists {return undef};
|
||||||
|
sub _get_favorites {return undef};
|
||||||
|
|
||||||
|
sub serialize {
|
||||||
|
my $self = shift;
|
||||||
|
my %opts = (
|
||||||
|
mkdir => 1, # make a directory for the files
|
||||||
|
dirname => $self->account_name,
|
||||||
|
path => './',
|
||||||
|
@_
|
||||||
|
);
|
||||||
|
# Download the file, rename it via the title, save a corrosponding .txt file with the description, title and original file name
|
||||||
|
|
||||||
|
log_info {"Making author dir: ", $self->account_name};
|
||||||
|
|
||||||
|
my $dir = $opts{path}.'/'.$opts{dirname}.'/';
|
||||||
|
log_debug {"dir => ", $dir};
|
||||||
|
|
||||||
|
mkdir($dir) or die "Couldn't mkdir $dir => $!";
|
||||||
|
|
||||||
|
# TODO save more than the first item
|
||||||
|
for my $item ($self->creations->[0]) {
|
||||||
|
# Serialize under the author directory
|
||||||
|
$item->serialize(path => $dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
1;
|
|
@ -4,6 +4,8 @@ use Moose;
|
||||||
use MooseX::Storage;
|
use MooseX::Storage;
|
||||||
use Shapeways;
|
use Shapeways;
|
||||||
|
|
||||||
|
use Shapeways::Logger;
|
||||||
|
|
||||||
with Storage(format => 'JSON');
|
with Storage(format => 'JSON');
|
||||||
|
|
||||||
has contents => (
|
has contents => (
|
||||||
|
@ -12,6 +14,16 @@ has contents => (
|
||||||
builder => '_download_item',
|
builder => '_download_item',
|
||||||
);
|
);
|
||||||
|
|
||||||
|
has image => (
|
||||||
|
is => 'ro',
|
||||||
|
lazy => 1,
|
||||||
|
builder => '_download_image',
|
||||||
|
);
|
||||||
|
|
||||||
|
has image_url => (
|
||||||
|
is => 'ro',
|
||||||
|
);
|
||||||
|
|
||||||
has download_url => (
|
has download_url => (
|
||||||
is => 'ro',
|
is => 'ro',
|
||||||
default => undef
|
default => undef
|
||||||
|
@ -34,18 +46,37 @@ has file_ext => (
|
||||||
default => '.zip', # will this ever be different? .stl maybe?
|
default => '.zip', # will this ever be different? .stl maybe?
|
||||||
);
|
);
|
||||||
|
|
||||||
|
sub _download_image {
|
||||||
|
my $self = shift;
|
||||||
|
|
||||||
|
log_info {"Downloading image"};
|
||||||
|
my $mech = Shapeways->mech;
|
||||||
|
my $res = $mech->get($self->image_url);
|
||||||
|
|
||||||
|
if ($res->is_success) {
|
||||||
|
# TODO check headers for extension
|
||||||
|
log_info {"Download successful"};
|
||||||
|
return $res->content;
|
||||||
|
} else {
|
||||||
|
log_error {"Failed to download: ", $res->code, " ", $res->status_line};
|
||||||
|
return undef;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
sub _download_item {
|
sub _download_item {
|
||||||
my $self = shift;
|
my $self = shift;
|
||||||
my $mech = Shapeways->mech();
|
my $mech = Shapeways->mech();
|
||||||
|
|
||||||
my $base = Shapeways->base_url();
|
my $base = Shapeways->base_url();
|
||||||
|
|
||||||
|
log_info {"Downloading ", $self->title, "(", $self->shapeways_id, ") ", $self->download_url};
|
||||||
my $res = $mech->get($base . $self->download_url);
|
my $res = $mech->get($base . $self->download_url);
|
||||||
|
|
||||||
if ($res->is_success) {
|
if ($res->is_success) {
|
||||||
# TODO check headers for extension
|
# TODO check headers for extension
|
||||||
|
log_info {"Download successful"};
|
||||||
return $res->content;
|
return $res->content;
|
||||||
} else {
|
} else {
|
||||||
warn "Failed to fetch: " . $self->freeze();
|
log_error {"Failed to download: ", $res->code, " ", $res->status_line};
|
||||||
return undef;
|
return undef;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -62,20 +93,31 @@ sub serialize {
|
||||||
);
|
);
|
||||||
# Download the file, rename it via the title, save a corrosponding .txt file with the description, title and original file name
|
# Download the file, rename it via the title, save a corrosponding .txt file with the description, title and original file name
|
||||||
|
|
||||||
my $dir = $opts{path}.'/'.$opts{dirname}.'/';
|
log_info {"Saving to disk: ", $self->title, "(", $self->shapeways_id, ") ", $self->download_url};
|
||||||
|
|
||||||
|
my $dir = $opts{path}.'/'.$opts{dirname}.'/';
|
||||||
|
log_debug {"dir => ", $dir};
|
||||||
|
|
||||||
|
mkdir($dir) or die "Couldn't mkdir $dir => $!";
|
||||||
|
|
||||||
mkdir($dir) or die "Couldn't mkdir $dir => $!";
|
log_debug {"Saving main file"};
|
||||||
|
open(my $fh, '>', $dir.$opts{filebase}.$self->file_ext) or die "Couldn't write base file => $!";
|
||||||
|
print $fh $self->contents;
|
||||||
|
close($fh);
|
||||||
|
|
||||||
open(my $fh, '>', $dir.$opts{filebase}.$self->file_ext) or die "Couldn't write base file => $!";
|
log_debug {"Saving text file"};
|
||||||
print $fh $self->contents;
|
open(my $descfh, '>', $dir.$opts{filebase}.'.txt') or die "Coouldn't open desc file => $!";
|
||||||
close($fh);
|
my $header = $self->title . " - " . $self->shapeways_id;
|
||||||
|
print $descfh $header, "\n";
|
||||||
open(my $descfh, '>', $dir.$opts{filebase}.'.txt') or die "Coouldn't open desc file => $!";
|
print $descfh "="x(length($header)), "\n\n";
|
||||||
my $header = $self->title . " - " . $self->shapeways_id;
|
print $descfh $self->description; # TODO html decoding? not sure
|
||||||
print $descfh $header, "\n";
|
close($descfh);
|
||||||
print $descfh "="x(length($header)), "\n\n";
|
|
||||||
print $descfh $self->description; # TODO html decoding? not sure
|
log_debug {"Saving image file"};
|
||||||
close($descfh);
|
open(my $imgfh, '>', $dir."folder.jpg") or die "Couldn't save image => $!";
|
||||||
|
print $imgfh $self->image;
|
||||||
|
close($imgfh);
|
||||||
|
# TODO fetch images
|
||||||
}
|
}
|
||||||
|
|
||||||
sub new_from_url {
|
sub new_from_url {
|
||||||
|
@ -89,6 +131,7 @@ sub new_from_url {
|
||||||
description => $tree_path->findvalue('//div[contains(@class,"product-description-content")]'),
|
description => $tree_path->findvalue('//div[contains(@class,"product-description-content")]'),
|
||||||
title => $tree_path->findvalue('//h1[contains(@class,"product-title-header")]'),
|
title => $tree_path->findvalue('//h1[contains(@class,"product-title-header")]'),
|
||||||
download_url => $tree_path->findvalue('//div[contains(@class,"product-page-download")]//a/@href'),
|
download_url => $tree_path->findvalue('//div[contains(@class,"product-page-download")]//a/@href'),
|
||||||
|
image_url => ($tree_path->findvalues('//img[contains(@class, "film-strip-img")]/@src'))[0], # only get the first image
|
||||||
};
|
};
|
||||||
|
|
||||||
my $obj = Shapeways::Item->new($data);
|
my $obj = Shapeways::Item->new($data);
|
||||||
|
|
12
lib/Shapeways/Logger.pm
Normal file
12
lib/Shapeways/Logger.pm
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
package Shapeways::Logger;
|
||||||
|
|
||||||
|
use base 'Exporter';
|
||||||
|
|
||||||
|
use Log::Contextual::SimpleLogger;
|
||||||
|
use Log::Contextual qw( :log :dlog ),
|
||||||
|
-logger => Log::Contextual::SimpleLogger->new({ levels_upto => $ENV{LOG_LEVEL}||'info' });
|
||||||
|
|
||||||
|
my @levels = qw/trace info error debug warn fatal/;
|
||||||
|
our @EXPORT=map({"log_$_"} @levels), map({"Dlog_$_"} @levels);
|
||||||
|
|
||||||
|
1;
|
Loading…
Add table
Reference in a new issue