You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

245 lines
5.7 KiB

#!/usr/bin/perl -w
# (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
#
# All Rights Reserved. See copyright.txt for details and a complete list of authors.
# Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
# $Id$
#
# This script helps you recover your wiki data from google cache if
# you somehow lose it.
# The tipical situation is that you have an old backup, so this script will
# crawl google cache for source code of wiki pages and put them in database.
# Don't forget to backup first.
use strict;
use LWP::UserAgent;
use CGI::Simple;
use DBI;
# results per page in google search
our $increment = 100;
# directory to dump all cached pages
our $dumpdir = "wiki_dump";
# url of your site
our $siteurl = "mydomain.com";
# list of important pages you remember by name, helps finding them
our @pages = qw(HomePage User+List Some+Page);
# configure database connection here
our $dbh = DBI->connect("dbi:mysql:database_name:localhost","root");
# tiki file patterns to be searched, you don't have to touch this
our @phpfiles = ('source "tiki pagehistory php "',
'source=0 "tiki pagehistory php "'
# ,'tiki-index.php'
);
our $cgi = new CGI::Simple;
our $ua = LWP::UserAgent->new;
chdir($dumpdir);
# this will fetch all results and dump each one to a file in dump dir
fetch();
# this will parse all pages and import best version to database, and put
# earlier version of page in history.
importDump();
sub importDump {
my %pages = %{getBestVersions()};
my $getPage = $dbh->prepare("select * from tiki_pages where pageName=?");
my $insertHistory = $dbh->prepare("insert into tiki_history values (?,?,?,?,?,?,?,?,?)");
my $updatePage = $dbh->prepare("update tiki_pages set ".
" data=?, ".
" lastModif=?, ".
" comment=?, ".
" version=?, ".
" user=? ".
" where pageName=?");
foreach my $page (keys %pages) {
my $version;
if (pageExists($page)) {
$getPage->execute($page);
my $info = $getPage->fetchrow_hashref;
$version = $info->{version} + 1;
$insertHistory->execute($page,
$info->{version},
0,
$info->{lastModif},
$info->{description},
$info->{user},
$info->{ip},
$info->{comment},
$info->{data});
} else {
$dbh->do("insert into tiki_pages (pageName) values (".$dbh->quote($page).")");
$version = 1;
}
# " data=?, ".
# " description=?, ".
# " lastModif=?, ".
# " comment=?, ".
# " version=?, ".
# " user=? ".
# " where pageName=?");#
my $info = $pages{$page};
$updatePage->execute($info->{content},
time(),
'recuperacao do cache do google',
$version,
'admin',
$page);
}
}
sub pageExists {
my $page = shift;
my ($id) = $dbh->selectrow_array("select page_id from tiki_pages where pageName=".$dbh->quote($page));
return $id ? 1 : 0;
}
sub getBestVersions {
my %data;
foreach my $file (<*>) {
my $info = getInfoFromFile($file);
if (!defined $data{$info->{page}} ||
$info->{version} == 0 ||
$data{$info->{page}}{version} < $info->{version}) {
$data{$info->{page}} = $info;
}
}
return \%data;
}
1;
sub getInfoFromFile {
my $file = shift;
my ($page, $source) = $cgi->url_decode($file) =~ /page=(.+?)\&.*?source=(\d+)/;
$page && defined $source
or return undef;
$page =~ s/\%([0-9A-Fa-f]{2})/chr(hex($1))/ge;
undef $/;
open ARQ, $file
or die $!;
my $content = <ARQ>;
close ARQ;
$content =~ s|^.+?<div[^>]+class="wikitext">(.+?)</div>.+$|$1|s;
$content =~ s/\r//gs;
$content =~ s|<br />||gs;
return {
'page' => $page,
'content' => $content,
'version' => $source
};
}
sub fetch {
$ua->timeout(30);
$ua->cookie_jar({});
$ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR; rv:1.8.0.1) Gecko/20060111 Firefox/1.5.0.1");
my $response = $ua->get('http://www.google.com/');
sleep 2;
my @queries;
foreach my $page (@pages) {
foreach my $phpfile (@phpfiles) {
push @queries, "$page $phpfile";
}
}
foreach my $query (@queries) {
my $offset = 0;
while (my @links = getList($query, $offset)) {
foreach my $link (@links) {
retrieveLink($link);
}
$offset += $increment;
}
}
}
sub retrieveLink {
my $link = shift;
my ($file) = $link =~ m|$siteurl/(.+)$|;
if (-f $file) {
open ARQ, ">>$file.repeated";
print ARQ ".";
close ARQ;
return 1;
}
my $response = $ua->get($link);
$response->is_success
or return undef;
open ARQ, ">$file";
print ARQ $response->content;
close ARQ;
}
sub getList {
my $query = shift;
my $start = shift;
$query = $cgi->url_encode($query);
my $url = 'http://www.google.com.br/search?q='.$query.'+site:'.$siteurl.'&num=100&hl=pt-BR&lr=&as_qdr=all&filter=0';
if ($start) {
$url .= '&sa=N&start='.$start;
}
my $response = $ua->get($url);
my $content;
if ($response->is_success) {
$content = $response->content; # or whatever
} else {
return 0;
}
my @links;
while (my ($siteUrl) = $content =~ m|href=\"(http://[0-9.]+/search\?q=cache[^\"]+)\"|) {
push @links, $siteUrl;
$content =~ s|href=\"(http://[0-9.]+/search\?q=cache[^\"]+)\"||;
}
return @links;
}