#!/usr/bin/perl

use DBI;
use strict;

# Database handle shared by the subs below; assigned in the main program.
my $dbh;

# Set to 1 by the -n option: look books up but never write to the database.
my $dryrun = 0;

# Maps a genre name as reported by the book search onto the genre name
# that is stored in the database.
my %genre = (
    'none'                         => 'none',
    'Misc'                         => 'Misc',

    'Comics & Graphic Novels'      => 'Comics',
    'Games'                        => 'Comics',

    'Language Arts & Disciplines'  => 'Art',
    'Art'                          => 'Art',
    'Music'                        => 'Art',

    'Sports & Recreation'          => 'Recreation',
    'Cooking'                      => 'Cooking',
    'Literature'                   => 'Literature',

    'Mathematics'                  => 'Science',
    'Technology'                   => 'Science',
    'Science'                      => 'Science',
    'Political Science'            => 'Science',
    'Business & Economics'         => 'Science',
    'Psychology'                   => 'Science',

    'Computer viruses'             => 'Computer',
    'Computers'                    => 'Computer',
    'Computer programming'         => 'Computer',

    'Juvenile Fiction'             => 'Fiction',
    'Fiction'                      => 'Fiction',
    'Humor'                        => 'Fiction',

    'Science Fiction'              => 'Science Fiction',
    'Fantasy'                      => 'Fantasy',
    'dictionary'                   => 'Encyclopedia',
    'Travel'                       => 'Travel',
);

# Translate a genre name through the %genre table.
#
# Arguments: the genre name to normalize.
# Returns:   the mapped genre when the table has a (true) entry for it,
#            announcing the mapping on STDOUT; otherwise the name unchanged.
sub normalize_genre {
    my ($raw) = @_;

    my $canonical = $genre{$raw};
    return $raw unless $canonical;

    print "** Genre genormaliseerd naar $canonical\n";
    return $canonical;
}

# Extract the details of the first book from a text dump of a book search
# result page.
#
# Arguments: the lines of the dump.
# Returns:   the list (title, author, genre, year of first publication) for
#            the first hit, or an empty list when no result-list header
#            followed by a title line and an author line was found.
#            genre is "none" when the author line carries no genre; year is
#            undef when the author line carries no third field.
# Prints progress notices to the currently selected output handle.
sub details {
    my @html = @_;

    my @details;
    my $in_results = 0;    # true once the result-list header has been seen
    my $field      = 0;    # 0: next line is the title, 1: next is the author line
    my $in_ad      = 0;    # true while skipping a sponsored-links block

    foreach my $line (@html) {
        next if $line =~ /^[ \t]*$/;
        $line =~ s/^[ \t]+//;

        # Hit counter, e.g. "3 of 3 on 9780201541991. (0.01 seconds)".
        # Counts may have more than one digit and are compared numerically.
        if ($line =~ /\d+ of (\d+) on/) {
            if ($1 > 1) {
                print "** Meer dan 1 boek gevonden, neem de eerste hit\n";
            }
        }

        if ($in_results) {
            if ($line =~ /sponsored links/i) {
                print "** Sponsered link, continue\n";
                $in_ad = 1;
                next;
            }

            if ($in_ad) {
                # The advertisement block is taken to end at the line that
                # mentions www.bol.com; this may need updating if the
                # advertiser changes.
                $in_ad = 0 if $line =~ /www\.bol\.com/;
                next;
            }

            if ($field == 0) {
                # title
                chomp $line;
                $line =~ s/[ \t]+$//;
                $line =~ s/^ ?//;
                # the title sometimes carries a " - Page xxx" suffix
                $line =~ s/ - Page.*$//;
                push @details, $line;
            } elsif ($field == 1) {
                # "by <author> - <genre> - <year> ..."
                chomp $line;
                $line =~ s/[ \t]+$//;
                $line =~ s/^by //;
                my @author = split /[ \t]+-[ \t]+/, $line;

                # When there is no genre the year shows up in the genre
                # position; move it to where it belongs.
                if (defined $author[1] && $author[1] =~ /^\d+$/) {
                    ($author[2], $author[1]) = ($author[1], "");
                }
                if (!defined $author[1] || $author[1] eq "") {
                    # no genre
                    $author[1] = "none";
                }
                push @details, @author[0..2];
            }
            $field++;
        }

        # The header line of the result list; the book data follows it.
        if ($line =~ /list.*view.*cover.*view/i) { $in_results = 1; }
        return @details if $field > 1;
    }
    return ();
}

# Check whether a book still has to be added to the database.
#
# Arguments: the ISBN to look up in the books table.
# Returns:   true when no row with this ISBN exists; always true in dry-run
#            mode; false when the book is already present.
# Dies when the statement cannot be prepared or executed, so that a failed
# lookup is never mistaken for "book not present".
sub retrieve {
    my $isbn = shift;
    my $sth = $dbh->prepare('SELECT * FROM books WHERE isbn = ?')
        or die "Couldn't prepare statement: " . $dbh->errstr;
    $sth->execute($isbn)
        or die "Couldn't look up book: " . $sth->errstr;

    # Count the fetched rows ourselves: for SELECT statements the value of
    # $sth->rows is driver dependent.  The fetch must stay in list context,
    # since a row may well start or end with an undefined column.
    my $found = 0;
    while (my @data = $sth->fetchrow_array()) {
        $found++;
    }
    $sth->finish;

    if ($dryrun == 0) {
        return ($found == 0);
    } else {
        return 1;
    }
}

# Store one book in the books table, or only report it in dry-run mode.
#
# Arguments: isbn, title, author, genre, year and epoch (in that order).
# The genre is passed through normalize_genre and lower-cased first; the
# resulting record is always echoed on STDOUT, separated by "|".
# Dies when the INSERT cannot be prepared or executed.
sub insert {
    my ($isbn, $title, $author, $genre, $year, $epoch) = @_;

    $genre = lc(normalize_genre($genre));
    print join("|", $title, $author, $genre, $year, "$epoch\n");

    if ($dryrun != 0) {
        print "** No commit\n";
    } else {
        # First and last column are left to the database (bound as NULL);
        # title, author and genre are explicitly stringified.
        my @values = (undef, $isbn, "$title", "$author", "$genre", $year, $epoch, undef);

        my $sth = $dbh->prepare('INSERT INTO books VALUES (?, ?, ?, ?, ?, ?, ?, ?)');
        $sth or die "Couldn't prepare statement: " . $dbh->errstr;

        my $ok = $sth->execute(@values);
        $ok or die "Couldn't insert book: " . $dbh->errstr;

        $sth->finish;
    }
}

# Main program.
#
# Options (must come first, in this order when both are given):
#   -n   dry run: look books up but do not write to the database
#   -g   print the distinct lower-cased genre names and exit
# The ISBN numbers are read one per line from the files named on the command
# line, or from STDIN.

# optional arguments
if (@ARGV && $ARGV[0] eq "-n") {
    print "** Dryrun, geen DB commits\n";
    $dryrun = 1;
    shift @ARGV;
}
if (@ARGV && $ARGV[0] eq "-g") {
    # print genres
    my %uniq;
    foreach my $name (values %genre) {
        $uniq{lc $name} = 1;
    }
    print join "\n", keys %uniq;
    exit 0;
}

# Connect only after the options are handled, so that -g does not need the
# database, and stop right away when the connection fails instead of
# crashing later on an undefined handle.
$dbh = DBI->connect("dbi:SQLite:dbname=biblio.db", "", "")
    or die "Couldn't connect to database: " . $DBI::errstr;

# read isbn numbers
while (my $isbn = <>) {
    chomp $isbn;
    if ($isbn !~ /^\d+$/) {
        print "** Ongeldig\n";
        next;
    }

    # $isbn consists of digits only (checked above), so it is safe to
    # interpolate it into the shell command.
    my @html = `links -width 200 -dump "http://books.google.com/books?q=+${isbn}&btnG=Search+Books"`;

    # title, author, genre, year of first publication
    my @det = details(@html);
    if (scalar @det == 0) {
        print "** Onbekend\n";
        next;
    }

    my $epoch = time();

    # genre
    if ($det[2] =~ /pages/) {
        # the genre field holds the page count, so no genre was listed
        $det[2] = "none";
    }
    # first published; if missing or empty, make it up
    if (!defined $det[3] || $det[3] eq "") {
        $det[3] = "0";
    }

    if (retrieve($isbn)) {
        insert($isbn, $det[0], $det[1], $det[2], $det[3], $epoch);
    } else {
        print "** Boek al aanwezig: $isbn: $det[0]\n";
    }
}

$dbh->disconnect;
