Capable of scraping all of an authors stuff
This commit is contained in:
parent
fad7ac8e68
commit
1fdefd9b07
5 changed files with 174 additions and 30 deletions
|
@ -5,8 +5,13 @@ use warnings;
|
|||
|
||||
use lib './lib';
|
||||
|
||||
use Data::Dumper;
|
||||
|
||||
use Shapeways::Account;
|
||||
use Shapeways::Item;
|
||||
|
||||
my $item = Shapeways::Item->new_from_url('http://www.shapeways.com/product/TCE9QM4VD/succubus?li=user-profile');
|
||||
my $item = Shapeways::Item->new_from_url('https://www.shapeways.com/product/TCE9QM4VD/succubus?li=user-profile');
|
||||
$item->serialize();
|
||||
#my $account = Shapeways::Account->new(account_name => 'mz4250');
|
||||
|
||||
$item->serialize;
|
||||
#$account->serialize;
|
|
@ -7,6 +7,7 @@ use HTML::TreeBuilder::XPath;
|
|||
use Data::Dumper;
|
||||
use JSON::MaybeXS;
|
||||
|
||||
use Shapeways::Logger;
|
||||
use Shapeways::Item;
|
||||
|
||||
class_has mech => (
|
||||
|
@ -30,13 +31,16 @@ sub get_url {
|
|||
my $self = shift;
|
||||
my ($url) = @_;
|
||||
|
||||
log_debug {"Fetching: ", $url};
|
||||
my $res = $self->mech->get($url);
|
||||
|
||||
if ($res->is_success) {
|
||||
my $tree_path = HTML::TreeBuilder::XPath->new_from_content($res->content);
|
||||
log_debug {"Fetch success"};
|
||||
|
||||
return ($tree_path, 200);
|
||||
} else {
|
||||
log_error {"Fetch failure => ", $res->code, " ", $res->status_line};
|
||||
return (undef, $res->code);
|
||||
}
|
||||
}
|
||||
|
@ -45,20 +49,17 @@ sub _build_mech {
|
|||
my $self = shift;
|
||||
my $mech = WWW::Mechanize::GZip->new();
|
||||
|
||||
log_info {"Logging in"};
|
||||
my $login = $mech->post('https://www.shapeways.com/login/json-shapeways', $self->config); # TODO better config?
|
||||
my $s = decode_json $login->content;
|
||||
|
||||
if ($s->{success}) {
|
||||
log_debug {"Logged in succesfully"};
|
||||
return $mech
|
||||
} else {
|
||||
log_debug {"Failed to log in: ", $login->content};
|
||||
die "login failed";
|
||||
}
|
||||
}
|
||||
|
||||
sub fetch_user {
|
||||
}
|
||||
|
||||
sub fetch_list { # Do these exist? maybe this is a cart?
|
||||
}
|
||||
|
||||
1;
|
||||
|
|
|
@ -1,17 +1,100 @@
|
|||
package Shapeways::Account;
|
||||
|
||||
use Moose;
|
||||
|
||||
use MooseX::Storage;
|
||||
use Shapeways;
|
||||
|
||||
use Shapeways::Logger;
|
||||
|
||||
use List::MoreUtils qw/uniq/;
|
||||
|
||||
use Data::Dumper;
|
||||
|
||||
with Storage(format => 'JSON');
|
||||
|
||||
has account_name => (
|
||||
is => 'ro',
|
||||
);
|
||||
|
||||
has account_id => (
|
||||
);
|
||||
|
||||
has items => (
|
||||
has creations => (
|
||||
is => 'ro',
|
||||
lazy => 1,
|
||||
default => sub {...}
|
||||
builder => '_get_creations',
|
||||
);
|
||||
|
||||
has favorites => (
|
||||
is => 'ro',
|
||||
lazy => 1,
|
||||
builder => '_get_favorites',
|
||||
);
|
||||
|
||||
# TODO this will make a list of their lists that contains items. probably wants a new class
|
||||
has lists => (
|
||||
is => 'ro',
|
||||
lazy => 1,
|
||||
builder => '_get_lists',
|
||||
);
|
||||
|
||||
sub _parse_page {
|
||||
my $self = shift;
|
||||
my $tree = shift;
|
||||
|
||||
# Gotta uniq this since the link is duplicated for text and pictures. easier to just do this than futz with the xpath
|
||||
my @items = uniq $tree->findvalues('//a[@class="product-url"]/@href');
|
||||
my $next_page = ($tree->findvalues('//a[span[contains(@class,"results-page-next")]]/@href'))[0];
|
||||
|
||||
return {
|
||||
items => \@items,
|
||||
next_page => $next_page,
|
||||
};
|
||||
}
|
||||
|
||||
sub _get_creations {
|
||||
my $self = shift;
|
||||
|
||||
# first page starts here
|
||||
my $next_page = 'http://www.shapeways.com/designer/'. $self->account_name;
|
||||
my @item_urls;
|
||||
|
||||
# Loop until we failed to find another page
|
||||
while ($next_page) {
|
||||
my ($tree_path, $code) = Shapeways->get_url($next_page);
|
||||
my $data = $self->_parse_page($tree_path);
|
||||
|
||||
push @item_urls, @{$data->{items}};
|
||||
$next_page = $data->{next_page};
|
||||
}
|
||||
|
||||
my @items = map {Shapeways::Item->new_from_url($_)} @item_urls;
|
||||
|
||||
return \@items;
|
||||
}
|
||||
|
||||
sub _get_lists {return undef};
|
||||
sub _get_favorites {return undef};
|
||||
|
||||
sub serialize {
|
||||
my $self = shift;
|
||||
my %opts = (
|
||||
mkdir => 1, # make a directory for the files
|
||||
dirname => $self->account_name,
|
||||
path => './',
|
||||
@_
|
||||
);
|
||||
# Download the file, rename it via the title, save a corrosponding .txt file with the description, title and original file name
|
||||
|
||||
log_info {"Making author dir: ", $self->account_name};
|
||||
|
||||
my $dir = $opts{path}.'/'.$opts{dirname}.'/';
|
||||
log_debug {"dir => ", $dir};
|
||||
|
||||
mkdir($dir) or die "Couldn't mkdir $dir => $!";
|
||||
|
||||
# TODO save more than the first item
|
||||
for my $item ($self->creations->[0]) {
|
||||
# Serialize under the author directory
|
||||
$item->serialize(path => $dir);
|
||||
}
|
||||
}
|
||||
|
||||
1;
|
|
@ -4,6 +4,8 @@ use Moose;
|
|||
use MooseX::Storage;
|
||||
use Shapeways;
|
||||
|
||||
use Shapeways::Logger;
|
||||
|
||||
with Storage(format => 'JSON');
|
||||
|
||||
has contents => (
|
||||
|
@ -12,6 +14,16 @@ has contents => (
|
|||
builder => '_download_item',
|
||||
);
|
||||
|
||||
has image => (
|
||||
is => 'ro',
|
||||
lazy => 1,
|
||||
builder => '_download_image',
|
||||
);
|
||||
|
||||
has image_url => (
|
||||
is => 'ro',
|
||||
);
|
||||
|
||||
has download_url => (
|
||||
is => 'ro',
|
||||
default => undef
|
||||
|
@ -34,18 +46,37 @@ has file_ext => (
|
|||
default => '.zip', # will this ever be different? .stl maybe?
|
||||
);
|
||||
|
||||
sub _download_image {
|
||||
my $self = shift;
|
||||
|
||||
log_info {"Downloading image"};
|
||||
my $mech = Shapeways->mech;
|
||||
my $res = $mech->get($self->image_url);
|
||||
|
||||
if ($res->is_success) {
|
||||
# TODO check headers for extension
|
||||
log_info {"Download successful"};
|
||||
return $res->content;
|
||||
} else {
|
||||
log_error {"Failed to download: ", $res->code, " ", $res->status_line};
|
||||
return undef;
|
||||
}
|
||||
}
|
||||
|
||||
sub _download_item {
|
||||
my $self = shift;
|
||||
my $mech = Shapeways->mech();
|
||||
|
||||
my $base = Shapeways->base_url();
|
||||
|
||||
log_info {"Downloading ", $self->title, "(", $self->shapeways_id, ") ", $self->download_url};
|
||||
my $res = $mech->get($base . $self->download_url);
|
||||
|
||||
if ($res->is_success) {
|
||||
# TODO check headers for extension
|
||||
log_info {"Download successful"};
|
||||
return $res->content;
|
||||
} else {
|
||||
warn "Failed to fetch: " . $self->freeze();
|
||||
log_error {"Failed to download: ", $res->code, " ", $res->status_line};
|
||||
return undef;
|
||||
}
|
||||
}
|
||||
|
@ -62,20 +93,31 @@ sub serialize {
|
|||
);
|
||||
# Download the file, rename it via the title, save a corrosponding .txt file with the description, title and original file name
|
||||
|
||||
log_info {"Saving to disk: ", $self->title, "(", $self->shapeways_id, ") ", $self->download_url};
|
||||
|
||||
my $dir = $opts{path}.'/'.$opts{dirname}.'/';
|
||||
log_debug {"dir => ", $dir};
|
||||
|
||||
mkdir($dir) or die "Couldn't mkdir $dir => $!";
|
||||
|
||||
log_debug {"Saving main file"};
|
||||
open(my $fh, '>', $dir.$opts{filebase}.$self->file_ext) or die "Couldn't write base file => $!";
|
||||
print $fh $self->contents;
|
||||
close($fh);
|
||||
|
||||
log_debug {"Saving text file"};
|
||||
open(my $descfh, '>', $dir.$opts{filebase}.'.txt') or die "Coouldn't open desc file => $!";
|
||||
my $header = $self->title . " - " . $self->shapeways_id;
|
||||
print $descfh $header, "\n";
|
||||
print $descfh "="x(length($header)), "\n\n";
|
||||
print $descfh $self->description; # TODO html decoding? not sure
|
||||
close($descfh);
|
||||
|
||||
log_debug {"Saving image file"};
|
||||
open(my $imgfh, '>', $dir."folder.jpg") or die "Couldn't save image => $!";
|
||||
print $imgfh $self->image;
|
||||
close($imgfh);
|
||||
# TODO fetch images
|
||||
}
|
||||
|
||||
sub new_from_url {
|
||||
|
@ -89,6 +131,7 @@ sub new_from_url {
|
|||
description => $tree_path->findvalue('//div[contains(@class,"product-description-content")]'),
|
||||
title => $tree_path->findvalue('//h1[contains(@class,"product-title-header")]'),
|
||||
download_url => $tree_path->findvalue('//div[contains(@class,"product-page-download")]//a/@href'),
|
||||
image_url => ($tree_path->findvalues('//img[contains(@class, "film-strip-img")]/@src'))[0], # only get the first image
|
||||
};
|
||||
|
||||
my $obj = Shapeways::Item->new($data);
|
||||
|
|
12
lib/Shapeways/Logger.pm
Normal file
12
lib/Shapeways/Logger.pm
Normal file
|
@ -0,0 +1,12 @@
|
|||
package Shapeways::Logger;
|
||||
|
||||
use base 'Exporter';
|
||||
|
||||
use Log::Contextual::SimpleLogger;
|
||||
use Log::Contextual qw( :log :dlog ),
|
||||
-logger => Log::Contextual::SimpleLogger->new({ levels_upto => $ENV{LOG_LEVEL}||'info' });
|
||||
|
||||
my @levels = qw/trace info error debug warn fatal/;
|
||||
our @EXPORT=map({"log_$_"} @levels), map({"Dlog_$_"} @levels);
|
||||
|
||||
1;
|
Loading…
Add table
Reference in a new issue