You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

106 lines
3.2 KiB

<?php
// (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
//
// All Rights Reserved. See copyright.txt for details and a complete list of authors.
// Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
// $Id$
namespace Tiki\File;
use Tiki\TikiInit;
use Tiki\Process\Process;
use Tiki\FileGallery\File;
use Tiki\Lib\Alchemy\AlchemyLib;
use Tiki\Lib\Unoconv\UnoconvLib;
use Unoconv\Exception\RuntimeException;
/**
 * Static helpers that prepare file gallery files for OCR: plain-text
 * extraction from office/PDF documents and image format conversion.
 */
class OcrHelper
{
    /**
     * Extract text from multiple file types for OCR purposes.
     *
     * Office documents are converted straight to text with unoconv; when unoconv
     * raises a RuntimeException the document is converted to PDF instead and then
     * handled like any other PDF, i.e. passed to the `pdftotext` binary located
     * through `which`.
     *
     * Every temporary file created along the way is removed before returning,
     * including when an exception is thrown.
     *
     * @param mixed $fileId Identifier handed to File::id() to load the file.
     * @return bool|string  The extracted text, or false when the file cannot be
     *                      loaded, when running on Windows, or when no text
     *                      output was produced by the converters.
     * @throws \Exception   When pdftotext cannot be located, or when the file's
     *                      mime-type is neither an office document nor a PDF.
     */
    public static function extractText($fileId)
    {
        $file = File::id($fileId);
        if (empty($file)) {
            return false;
        }

        // Bail out before any temp file or external process is created on Windows.
        if (substr(PHP_OS, 0, 3) === 'WIN') {
            return false;
        }

        global $mimetypes, $tikidomain;
        include_once('lib/mime/mimetypes.php');

        $tempReadableFilePath = $file->getWrapper()->getReadableFile();
        $pdfTempFile = null;
        $tempFileName = implode(DIRECTORY_SEPARATOR, ['temp', 'cache', $tikidomain, uniqid()]);
        $mimetypeMatched = false;

        try {
            if (FileHelper::isOfficeDocument($file->filetype)) {
                $mimetypeMatched = true;
                try {
                    $unoconv = new UnoconvLib();
                    $unoconv->convertFile($tempReadableFilePath, $tempFileName, 'txt');
                } catch (RuntimeException $e) {
                    // Unoconv was unable to extract text without converting to PDF first
                    $pdfTempFile = PDFHelper::convertToPDF($fileId);
                }
            }

            // Only trust the PDF fallback when the converted file really exists on disk.
            $hasPdfFallback = ! empty($pdfTempFile) && file_exists($pdfTempFile);

            if ($file->filetype == $mimetypes['pdf'] || $hasPdfFallback) {
                $mimetypeMatched = true;
                $pdfSourcePath = $hasPdfFallback ? $pdfTempFile : $tempReadableFilePath;

                $process = new Process(['which', 'pdftotext']);
                $process->run();
                $pdfToTextPath = preg_replace('/\s+/', ' ', trim($process->getOutput()));
                if (empty($pdfToTextPath) || TikiInit::isWindows()) {
                    throw new \Exception('Text cannot be extracted because pdftotext library is unavailable or is not supported.');
                }

                $process = new Process([$pdfToTextPath, $pdfSourcePath, $tempFileName]);
                $process->run();
            }

            if (! $mimetypeMatched) {
                throw new \Exception('Mime-type not supported for OCR text extraction');
            }

            // No converter produced an output file (e.g. unoconv failed and the PDF
            // fallback yielded nothing): report failure instead of reading a missing file.
            if (! is_file($tempFileName)) {
                return false;
            }

            // The contents are read before the finally block deletes the file.
            return file_get_contents($tempFileName);
        } finally {
            // Runs on both return and throw so temporary files never linger.
            foreach ([$pdfTempFile, $tempFileName] as $temporaryPath) {
                if (! empty($temporaryPath) && is_file($temporaryPath)) {
                    unlink($temporaryPath);
                }
            }
        }
    }

    /**
     * Convert an image to another image file format.
     *
     * Loads the file, obtains a readable copy through its wrapper and delegates
     * the conversion to AlchemyLib::convertToImage().
     *
     * @param mixed  $fileId               Identifier handed to File::id() to load the file.
     * @param string $newPathWithExtension Destination path passed on to AlchemyLib.
     * @return bool|string|null False when the file cannot be loaded, otherwise
     *                          whatever AlchemyLib::convertToImage() returns.
     * @TODO - This function should probably be moved into a more generic place.
     */
    public static function convertImage($fileId, $newPathWithExtension)
    {
        $file = File::id($fileId);
        if (empty($file)) {
            return false;
        }

        $alchemy = new AlchemyLib();
        $temporaryFile = $file->getWrapper()->getReadableFile();

        return $alchemy->convertToImage($temporaryFile, $newPathWithExtension);
    }
}