creylopez
/
k8s-cluster-projects


								#!/usr/bin/perl -w

								# (c) Copyright by authors of the Tiki Wiki CMS Groupware Project

								#

								# All Rights Reserved. See copyright.txt for details and a complete list of authors.

								# Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.

								# $Id$


								#

								# This script helps you recover your wiki data from google cache if

								# you somehow lose it.

								# The typical situation is that you have an old backup, so this script will

								# crawl google cache for source code of wiki pages and put them in database.

								# Don't forget to back up first.


								use strict;

								use LWP::UserAgent;

								use CGI::Simple;

								use DBI;


								# ---------------------------------------------------------------------
								# Configuration
								# ---------------------------------------------------------------------

								# results per page in google search; fetch() advances the result offset
								# by this amount after every result page
								our $increment = 100;

								# directory to dump all cached pages
								our $dumpdir = "wiki_dump";

								# url of your site
								our $siteurl = "mydomain.com";

								# list of important pages you remember by name, helps finding them
								our @pages = qw(HomePage User+List Some+Page);

								# configure database connection here
								# Fail right away when the database is unreachable: otherwise the whole
								# crawl runs first and the script only breaks once importDump() tries to
								# use the undefined handle.
								our $dbh = DBI->connect("dbi:mysql:database_name:localhost","root")
								    or die "Cannot connect to database: $DBI::errstr\n";

								# tiki file patterns to be searched, you don't have to touch this
								our @phpfiles = ('source "tiki pagehistory php "',
								         'source=0 "tiki pagehistory php "'
								#         ,'tiki-index.php'
								         );

								our $cgi = CGI::Simple->new;
								our $ua = LWP::UserAgent->new;

								# ---------------------------------------------------------------------
								# Main
								# ---------------------------------------------------------------------

								# All dump files are read and written with relative names, so the script
								# must really be inside the dump directory before doing anything else.
								# Without this check a failed chdir would make the import pick up every
								# file of whatever directory the script was started from.
								unless (-d $dumpdir) {
								    mkdir $dumpdir
								        or die "Cannot create dump directory '$dumpdir': $!\n";
								}
								chdir($dumpdir)
								    or die "Cannot change to dump directory '$dumpdir': $!\n";

								# this will fetch all results and dump each one to a file in dump dir
								fetch();

								# this will parse all pages and import best version to database, and put
								# earlier version of page in history.
								importDump();


								# Imports the best recovered version of every dumped page into the
								# database.
								#
								# For each page returned by getBestVersions():
								#   * if pageExists() reports the page, its current tiki_pages row is
								#     copied into tiki_history and the new version is the old one plus 1;
								#   * otherwise an empty tiki_pages row is created and the version is 1;
								#   * the tiki_pages row is then updated with the recovered content, the
								#     current time, a fixed comment and the user 'admin'.
								#
								# Takes no arguments and uses the global $dbh. Pages that cannot be
								# archived are skipped with a warning instead of being overwritten.
								sub importDump {
								    my %pages = %{getBestVersions()};

								    my $getPage = $dbh->prepare("select * from tiki_pages where pageName=?");
								    my $insertHistory = $dbh->prepare("insert into tiki_history values (?,?,?,?,?,?,?,?,?)");
								    my $insertPage = $dbh->prepare("insert into tiki_pages (pageName) values (?)");
								    my $updatePage = $dbh->prepare("update tiki_pages set ".
								                   " data=?, ".
								                   " lastModif=?, ".
								                   " comment=?, ".
								                   " version=?, ".
								                   " user=? ".
								                   " where pageName=?");

								    foreach my $page (sort keys %pages) {
								        my $recovered = $pages{$page};

								        # getBestVersions() can yield an entry keyed by the empty string
								        # and without content for dump files whose name could not be
								        # parsed; importing it would create a nameless, empty page.
								        if ($page eq '' || !defined $recovered->{content}) {
								            warn "Skipping dump entry without page name or content\n";
								            next;
								        }

								        my $version;

								        if (pageExists($page)) {
								            $getPage->execute($page);
								            my $current = $getPage->fetchrow_hashref;
								            $getPage->finish;

								            unless ($current) {
								                warn "Could not read current version of '$page'; leaving it untouched\n";
								                next;
								            }

								            # Archive the live row first. If that fails the live row is
								            # the only copy of the existing text, so it must not be
								            # overwritten.
								            my $archived = $insertHistory->execute($page,
								                        $current->{version},
								                        0,
								                        $current->{lastModif},
								                        $current->{description},
								                        $current->{user},
								                        $current->{ip},
								                        $current->{comment},
								                        $current->{data});
								            unless ($archived) {
								                warn "Could not archive current version of '$page'; leaving it untouched\n";
								                next;
								            }

								            $version = ($current->{version} || 0) + 1;
								        } else {
								            $insertPage->execute($page);
								            $version = 1;
								        }

								        $updatePage->execute($recovered->{content},
								                     time(),
								                     'recuperacao do cache do google',
								                     $version,
								                     'admin',
								                     $page);
								    }

								    return;
								}


								# Reports whether tiki_pages already holds a row for the given page name.
								#
								# Argument: the page name.
								# Returns:  1 when a row with a true page_id is found, 0 otherwise.
								sub pageExists {
								    my ($name) = @_;

								    my $lookup = "select page_id from tiki_pages where pageName=" . $dbh->quote($name);
								    my ($pageId) = $dbh->selectrow_array($lookup);

								    return 1 if $pageId;
								    return 0;
								}


								# Scans the current directory and picks, for every page, the dump file
								# that should be imported.
								#
								# Returns a reference to a hash mapping page name to the info hash
								# produced by getInfoFromFile() (keys: page, content, version).
								#
								# Selection rule: a dump with version 0 always wins; otherwise the
								# highest version number wins.
								# NOTE(review): version 0 presumably denotes the current revision of the
								# page (the original condition already let it win unconditionally) --
								# confirm against the Tiki page history URL scheme.
								sub getBestVersions {
								    my %data;

								    foreach my $file (glob '*') {
								        # "<name>.repeated" files are counters written by retrieveLink()
								        # and contain only dots; their names still look like page dumps,
								        # so they must not be parsed as content.
								        next if $file =~ /\.repeated\z/;
								        next unless -f $file;

								        # getInfoFromFile() yields nothing for files whose name carries no
								        # page/source information; dereferencing that would create a
								        # bogus entry under an empty page name.
								        my $info = getInfoFromFile($file)
								            or next;

								        my $known = $data{ $info->{page} };

								        # Once a version-0 dump is stored, a numbered version must not
								        # replace it just because it happens to be read later.
								        if (!defined $known
								            || $info->{version} == 0
								            || ($known->{version} != 0
								                && $known->{version} < $info->{version})) {
								            $data{ $info->{page} } = $info;
								        }
								    }

								    return \%data;
								}


								# Bare true value. It is evaluated after fetch() and importDump() have
								# run and has no effect of its own; the subs defined below it are still
								# compiled before the script starts executing.
								1;


								# Extracts page name, version and wiki text from one dumped cache file.
								#
								# Argument: the dump file name (relative to the current directory). The
								#           name is URL-decoded and must contain "page=<name>&" followed
								#           later by "source=<digits>".
								# Returns:  a hash reference with the keys
								#             page    - the page name, percent-decoded a second time
								#             content - text inside the first <div class="wikitext">,
								#                       without carriage returns and "<br />" tags; the
								#                       whole file when no such div is found
								#             version - the number taken from "source="
								#           or nothing (undef in scalar context) when the file name does
								#           not carry page/source information.
								# Dies when the file cannot be opened.
								sub getInfoFromFile {
								    my $file = shift;

								    my ($page, $source) = $cgi->url_decode($file) =~ /page=(.+?)\&.*?source=(\d+)/;

								    # Test definedness and length rather than truth so that a page that is
								    # literally named "0" is not discarded.
								    return unless defined $page && length $page && defined $source;

								    # The name may still carry a second layer of percent-encoding.
								    $page =~ s/\%([0-9A-Fa-f]{2})/chr(hex($1))/ge;

								    open my $fh, '<', $file
								        or die "Cannot read '$file': $!";
								    # Slurp the whole file; "local" restores the record separator so the
								    # rest of the program is not left in slurp mode.
								    my $content = do { local $/; <$fh> };
								    close $fh;

								    $content =~ s|^.+?<div[^>]+class="wikitext">(.+?)</div>.+$|$1|s;

								    $content =~ s/\r//gs;
								    $content =~ s|<br />||gs;

								    return {
								        'page'    => $page,
								        'content' => $content,
								        'version' => $source,
								    };
								}


								# Crawls the search results for every configured page and dumps each
								# cached copy into the current directory.
								#
								# Prepares the shared user agent (30 s timeout, in-memory cookie jar,
								# browser-like agent string), requests the Google front page once, waits
								# two seconds, and then runs one search per combination of @pages and
								# @phpfiles. Result pages are walked in steps of $increment until
								# getList() returns an empty list; every link found is passed to
								# retrieveLink(). Takes no arguments.
								sub fetch {
								    $ua->timeout(30);
								    $ua->cookie_jar({});
								    $ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR; rv:1.8.0.1) Gecko/20060111 Firefox/1.5.0.1");

								    # Initial request to the front page; its response is not inspected.
								    $ua->get('http://www.google.com/');

								    sleep 2;

								    # One query per page/pattern pair, pages in the outer position.
								    my @searches = map {
								        my $name = $_;
								        map { "$name $_" } @phpfiles;
								    } @pages;

								    for my $search (@searches) {
								        for (my $position = 0; ; $position += $increment) {
								            my @found = getList($search, $position);
								            last unless @found;

								            retrieveLink($_) for @found;
								        }
								    }
								}


								# Downloads one cached page and stores it in the current directory.
								#
								# Argument: the URL of the cached copy. The part of the URL that follows
								#           "$siteurl/" becomes the file name.
								# Returns:  1 when the page was stored, or when a file of that name
								#           already existed (in which case one dot is appended to
								#           "<name>.repeated" as a hit counter);
								#           nothing when the link is unusable, the download fails or the
								#           file cannot be written.
								sub retrieveLink {
								    my $link = shift;

								    # \Q...\E: the site URL is a literal, its dots must not act as regex
								    # wildcards.
								    my ($file) = $link =~ m|\Q$siteurl\E/(.+)$|;

								    unless (defined $file) {
								        warn "Skipping link that does not point at $siteurl: $link\n";
								        return;
								    }

								    # The name comes from remote HTML. A path separator would let it leave
								    # the dump directory, and files outside the top level of the dump
								    # directory are never seen by the import anyway.
								    if ($file =~ m|/|) {
								        warn "Skipping link whose file name contains a path separator: $link\n";
								        return;
								    }

								    if (-f $file) {
								        # Already downloaded: only count the repeated hit.
								        if (open my $marker, '>>', "$file.repeated") {
								            print {$marker} ".";
								            close $marker;
								        } else {
								            warn "Cannot update '$file.repeated': $!\n";
								        }
								        return 1;
								    }

								    my $response = $ua->get($link);
								    $response->is_success
								        or return;

								    # Three-argument open: the mode cannot be influenced by the file name.
								    open my $out, '>', $file
								        or do { warn "Cannot write '$file': $!\n"; return; };
								    print {$out} $response->content;
								    close $out
								        or do { warn "Cannot finish writing '$file': $!\n"; return; };

								    return 1;
								}


								# Runs one search restricted to $siteurl and returns the links to cached
								# copies found on the result page.
								#
								# Arguments: the search query (plain text, it is URL-encoded here) and the
								#            result offset; the offset is only sent when it is true.
								# Returns:   the list of cache URLs in page order. The list is empty when
								#            the request fails or the page holds no cache links, so a
								#            caller looping "while (my @links = getList(...))" stops in
								#            both cases.
								#
								# NOTE(review): the URL asks for num=100 results; this has to stay equal
								# to the global $increment that fetch() uses to advance the offset.
								sub getList {
								    my $query = shift;
								    my $start = shift;

								    $query = $cgi->url_encode($query);

								    my $url = 'http://www.google.com.br/search?q='.$query.'+site:'.$siteurl.'&num=100&hl=pt-BR&lr=&as_qdr=all&filter=0';
								    if ($start) {
								        $url .= '&sa=N&start='.$start;
								    }

								    my $response = $ua->get($url);

								    # Bare return: an empty list in list context. Returning 0 here would
								    # hand the caller the one-element list (0), which counts as "links
								    # found" and keeps a while-loop over the result pages running forever.
								    return unless $response->is_success;

								    my $content = $response->content;

								    # Collect every cache link in a single pass over the page.
								    my @links = $content =~ m|href=\"(http://[0-9.]+/search\?q=cache[^\"]+)\"|g;

								    return @links;
								}