#!/usr/bin/perl
# (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
#
# All Rights Reserved. See copyright.txt for details and a complete list of authors.
# Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
# $Id$
#
# This script helps you recover your wiki data from the google cache if
# you somehow lose it.
# The typical situation is that you have an old backup, so this script will
# crawl the google cache for the source code of wiki pages and put them in
# the database.  Don't forget to backup first.

use strict;
use warnings;
use LWP::UserAgent;
use CGI::Simple;
use DBI;

# results per page in google search
our $increment = 100;

# directory to dump all cached pages
our $dumpdir = "wiki_dump";

# url of your site
our $siteurl = "mydomain.com";

# list of important pages you remember by name, helps finding them
our @pages = qw(HomePage User+List Some+Page);

# configure database connection here
our $dbh = DBI->connect( "dbi:mysql:database_name:localhost", "root" )
    or die "cannot connect to database: " . DBI->errstr;

# tiki file patterns to be searched, you don't have to touch this
our @phpfiles = (
    'source "tiki pagehistory php "',
    'source=0 "tiki pagehistory php "'
    # ,'tiki-index.php'
);

our $cgi = CGI::Simple->new;
our $ua  = LWP::UserAgent->new;

chdir $dumpdir or die "cannot chdir to $dumpdir: $!";

# this will fetch all results and dump each one to a file in the dump dir
fetch();

# this will parse all dumped pages and import the best version of each to
# the database, putting any earlier stored version of the page in history
importDump();

# Import the best recovered version of every page into tiki_pages.
# If the page already exists, its current revision is archived into
# tiki_history first and the version number is bumped; otherwise a fresh
# row is created at version 1.
sub importDump {
    my %pages = %{ getBestVersions() };

    my $getPage       = $dbh->prepare("select * from tiki_pages where pageName=?");
    my $insertHistory = $dbh->prepare("insert into tiki_history values (?,?,?,?,?,?,?,?,?)");
    my $updatePage    = $dbh->prepare(
          "update tiki_pages set "
        . " data=?, "
        . " lastModif=?, "
        . " comment=?, "
        . " version=?, "
        . " user=? "
        . " where pageName=?"
    );

    foreach my $page ( keys %pages ) {
        my $version;
        if ( pageExists($page) ) {
            # archive the revision currently stored in tiki_pages
            $getPage->execute($page);
            my $old = $getPage->fetchrow_hashref;
            $version = $old->{version} + 1;
            $insertHistory->execute(
                $page,             $old->{version},     0,
                $old->{lastModif}, $old->{description}, $old->{user},
                $old->{ip},        $old->{comment},     $old->{data},
            );
        }
        else {
            $dbh->do( "insert into tiki_pages (pageName) values (" . $dbh->quote($page) . ")" );
            $version = 1;
        }

        my $info = $pages{$page};
        # comment string 'recuperacao do cache do google' is Portuguese for
        # "recovery from the google cache"; kept verbatim since it is stored
        # in the database
        $updatePage->execute( $info->{content}, time(),
            'recuperacao do cache do google', $version, 'admin', $page );
    }
}

# Return true iff a row for $page already exists in tiki_pages.
sub pageExists {
    my $page = shift;
    my ($id) = $dbh->selectrow_array(
        "select page_id from tiki_pages where pageName=" . $dbh->quote($page) );
    return $id ? 1 : 0;
}

# Scan every file in the dump directory and keep, for each page, the best
# version found: a "source=0" hit (the live page) wins outright, otherwise
# the highest version number wins.  Returns a hashref of page => info.
sub getBestVersions {
    my %best;
    foreach my $file (<*>) {
        # skip files whose names do not parse as cached wiki pages
        # (the original dereferenced the undef return and polluted %best)
        my $info = getInfoFromFile($file) or next;
        my $name = $info->{page};
        if (   !defined $best{$name}
            || $info->{version} == 0
            || $best{$name}{version} < $info->{version} )
        {
            $best{$name} = $info;
        }
    }
    return \%best;
}

# Parse one dumped cache file.  The file name is the url-encoded query
# string of the cached request; the wikitext body is extracted from the
# cached HTML.  Returns a hashref { page, content, version }, or undef
# when the file name does not look like a cached wiki page.
sub getInfoFromFile {
    my $file = shift;

    my ( $page, $source ) = $cgi->url_decode($file) =~ /page=(.+?)\&.*?source=(\d+)/;
    $page && defined $source or return;
    # second decoding pass for doubly-escaped page names
    $page =~ s/\%([0-9A-Fa-f]{2})/chr(hex($1))/ge;

    # slurp the whole file; $/ is localized so nothing else is affected
    # (the original set it globally and never restored it)
    local $/;
    open my $fh, '<', $file or die "cannot open $file: $!";
    my $content = <$fh>;
    close $fh;

    # NOTE(review): the HTML tags in the two patterns below were stripped
    # from the original source by whatever mangled this file; they were
    # reconstructed from context (Tiki renders the page body inside a
    # div with class "wikitext").  Verify against a real cached page.
    $content =~ s|^.+?<div[^>]+class="wikitext">(.+?)</div>.+$|$1|s;
    $content =~ s/\r//gs;
    $content =~ s|<br\s*/?>||gs;

    return {
        'page'    => $page,
        'content' => $content,
        'version' => $source,
    };
}

# Warm up a browser-like user agent, build one query per (page, pattern)
# pair, and download every google cache hit for each query, paging through
# the results $increment at a time.
sub fetch {
    $ua->timeout(30);
    $ua->cookie_jar( {} );
    # pretend to be a regular browser so google serves normal result pages
    $ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR; rv:1.8.0.1) Gecko/20060111 Firefox/1.5.0.1");

    my $response = $ua->get('http://www.google.com/');
    sleep 2;    # be polite before hitting the search urls

    my @queries;
    foreach my $page (@pages) {
        foreach my $phpfile (@phpfiles) {
            push @queries, "$page $phpfile";
        }
    }

    foreach my $query (@queries) {
        my $offset = 0;
        while ( my @links = getList( $query, $offset ) ) {
            retrieveLink($_) foreach @links;
            $offset += $increment;
        }
    }
}

# Download one cached page into a file named after its path on $siteurl.
# An already-downloaded file is only marked as repeated, not fetched again.
sub retrieveLink {
    my $link = shift;

    my ($file) = $link =~ m|$siteurl/(.+)$|;
    defined $file or return;    # link does not point into our site

    if ( -f $file ) {
        # keep a tally of how often google returned this same page
        open my $rep, '>>', "$file.repeated" or die "cannot open $file.repeated: $!";
        print $rep ".";
        close $rep;
        return 1;
    }

    my $response = $ua->get($link);
    $response->is_success or return;

    open my $out, '>', $file or die "cannot write $file: $!";
    print $out $response->content;
    close $out or die "cannot close $file: $!";
}

# Run one google query at the given result offset and return the list of
# cache urls found on the result page.  Returns the empty list on HTTP
# failure or when there are no more results.
sub getList {
    my ( $query, $start ) = @_;

    $query = $cgi->url_encode($query);
    my $url = 'http://www.google.com.br/search?q=' . $query
            . '+site:' . $siteurl
            . '&num=100&hl=pt-BR&lr=&as_qdr=all&filter=0';
    $url .= '&sa=N&start=' . $start if $start;

    my $response = $ua->get($url);
    # BUGFIX: the original returned 0 on failure; the caller's list
    # assignment turned that into the one-element list (0) -- a true
    # value that caused an endless loop and a bogus retrieveLink(0) call.
    # A bare return yields the empty list and stops the paging loop.
    return unless $response->is_success;
    my $content = $response->content;

    # collect every cache hit in a single /g pass instead of the original
    # match-then-substitute loop (same links, linear instead of quadratic)
    my @links;
    while ( $content =~ m|href=\"(http://[0-9.]+/search\?q=cache[^\"]+)\"|g ) {
        push @links, $1;
    }
    return @links;
}