<?php

// (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
//
// All Rights Reserved. See copyright.txt for details and a complete list of authors.
// Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
// $Id$


class PageContentLib
{
    public function augmentInformation($data)
    {
        global $prefs;

        if ($prefs['page_content_fetch'] == 'y') {
            $new = $this->grabContent($data['url']);
            if ($new) {
                $data['content'] = $new['content'];
            }
        }

        return $data;
    }

    public function grabContent($url)
    {
        $tikilib = TikiLib::lib('tiki');

        $client = $tikilib->get_http_client($url);
        $response = $tikilib->http_perform_request($client);

        // Obtain the URL after redirections
        $url = (string) $client->getUri();
        $html = $response->getBody();

        // Note: PHP Readability expects UTF-8 encoded content.
        // If your content is not UTF-8 encoded, convert it
        // first before passing it to PHP Readability.
        // Both iconv() and mb_convert_encoding() can do this.

        // If we've got Tidy, let's clean up input.
        // This step is highly recommended - PHP's default HTML parser
        // often doesn't do a great job and results in strange output.
        $html = $this->tidy($html);

        // give it to Readability
        global $prefs;
        if (! class_exists('Readability\Readability')) {
            return false;
        }

        $readability = new Readability\Readability($html, $url);

        $result = $readability->init();

        if ($result) {
            $content = $this->tidy($readability->getContent()->innerHTML);
            $content = $this->replacePaths($content, $url);
            return [
                'title' => $readability->getTitle()->textContent,
                'content' => $content,
            ];
        }
    }

    private function tidy($html)
    {
        if (function_exists('tidy_parse_string')) {
            $tidy = tidy_parse_string($html, [], 'UTF8');
            $tidy->cleanRepair();
            $html = $tidy->value;
        }

        return $html;
    }

    private function getUrls($url)
    {
        // From http://stackoverflow.com/questions/21201062/using-readability-api-to-scrape-most-relavant-image-from-page

        // Parse URL
        $urlArr = parse_url($url);

        // Determine Base URL, with scheme, host, and port
        $base = $urlArr['scheme'] . "://" . $urlArr['host'];
        if (array_key_exists("port", $urlArr) && $urlArr['port'] != 80) {
            $base .= ":" . $urlArr['port'];
        }

        // Truncate the Path using the position of the last forward slash
        $relative = $base . substr($urlArr['path'], 0, strrpos($urlArr['path'], "/") + 1);

        // Return our two URLs
        return [$base, $relative];
    }

    public function replacePaths($html, $url)
    {
        // Modified from: http://stackoverflow.com/questions/21201062/using-readability-api-to-scrape-most-relavant-image-from-page

        // Retrieve our URLs
        list($baseUrl, $relativeUrl) = $this->getUrls($url);

        $convert = function ($url) use ($baseUrl, $relativeUrl) {
            // Resolve relative paths
            if (substr($url, 0, 2) == "//") { // Missing protocol
                // Fine, use current
            } elseif (substr($url, 0, 1) == "/") { // Path Relative to Base
                $url = $baseUrl . $url;
            } elseif (substr($url, 0, 4) !== "http") { // Path Relative to Dimension
                $url = $relativeUrl . $url;
            }

            return $url;
        };

        libxml_use_internal_errors(true);

        $dom = new DOMDocument();
        $dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));

        foreach ($dom->getElementsByTagName('img') as $node) {
            $image = $node->getAttribute('src');

            $node->setAttribute('src', $convert($image));
        }

        foreach ($dom->getElementsByTagName('a') as $node) {
            $link = $node->getAttribute('href');

            $node->setAttribute('href', $convert($link));
        }

        return $dom->saveHTML();
    }
}