ocrMime = self::OCR_MIME_NATIVE;
        if (is_callable('imagepng')) {
            $this->ocrMime = array_merge(self::OCR_MIME_CONVERT, $this->ocrMime);
        }
        exec($prefs['ocr_pdfimages_path'] . ' -v', $output, $return);
        if ($return === 0) {
            $this->ocrMime = array_merge(self::PDF_MIME, $this->ocrMime);
        }
    }

    /**
     * Resolves the absolute path of a command by asking the shell, trying lookup
     * commands that exist on Unix-like systems and on Windows.
     *
     * @param string $executable The file name to locate
     *
     * @return string|null First output line of the first lookup command that exits with status 0,
     *                     or null when none of the lookup commands succeed
     * @throws Exception If exec() is not callable
     * todo Find the correct exit code on Windows if the "where" does not find the command.
     */
    public function whereIsExecutable(string $executable): ?string
    {
        if (! is_callable('exec')) {
            throw new Exception('exec() is not enabled. Could not execute command.');
        }

        $quoted = escapeshellarg($executable);

        // Each prefix is one lookup tool; stderr is folded into stdout for all of them.
        foreach (['type -p ', 'where ', 'which '] as $lookup) {
            $lines = null;
            $status = null;
            exec($lookup . $quoted . ' 2>&1', $lines, $status);
            if ($status === 0) {
                return array_shift($lines);
            }
        }

        return null;
    }

    /**
     * Verifies that the file id held in $nextOCRFile has a row in tiki_files.
     *
     * @throws Exception If no matching file row exists
     */
    public function checkFileGalID()
    {
        $exists = $this->table('tiki_files')->fetchBool(['fileId' => $this->nextOCRFile]);
        if ($exists) {
            return;
        }

        throw new Exception('The File ID specified does not exist.');
    }

    /**
     * Verifies every prerequisite for OCR: the feature preference, the Tesseract
     * PHP package and a sufficiently recent Tesseract binary.
     *
     * @throws Exception Naming the first prerequisite that is not satisfied
     */
    public function checkOCRDependencies()
    {
        global $prefs;

        // Checks run in order and stop at the first failure.
        $problem = null;
        if ($prefs['ocr_enable'] !== 'y') {
            $problem = 'Feature Disabled';
        } elseif (! class_exists('thiagoalessio\TesseractOCR\TesseractOCR')) {
            $problem = 'Tesseract not installed in Packages.';
        } elseif (! $this->checkTesseractVersion()) {
            $problem = 'Tesseract binary not found.';
        }

        if ($problem !== null) {
            throw new Exception($problem);
        }
    }

    /**
     * Tells whether the Tesseract binary can be found.
     *
     * @return bool true when the presence check passes, false when the PHP package
     *              is missing or the presence check throws
     */
    private function checkTesseractInstalled(): bool
    {
        if (! class_exists('thiagoalessio\TesseractOCR\TesseractOCR')) {
            return false;
        }

        try {
            FriendlyErrors::checkTesseractPresence($this->newTesseract()->command->executable);

            return true;
        } catch (Exception $e) {
            return false;
        }
    }

    /**
     * Reads the version reported by the Tesseract binary.
     *
     * @return string Version number, or an empty string when Tesseract is unavailable
     */
    public function getTesseractVersion(): string
    {
        $usable = class_exists('thiagoalessio\TesseractOCR\TesseractOCR')
            && $this->checkTesseractInstalled();
        if (! $usable) {
            return '';
        }

        return $this->newTesseract()->command->getTesseractVersion();
    }

    /**
     * Compares the installed binary version against TESSERACT_BINARY_VERSION.
     *
     * @return bool True when the installed version is at least the required one
     */
    public function checkTesseractVersion(): bool
    {
        $installed = $this->getTesseractVersion();

        return version_compare($installed, self::TESSERACT_BINARY_VERSION, '>=');
    }

    /**
     * Lists the languages the Tesseract binary reports as available.
     *
     * @return array Language codes, or an empty array when Tesseract is unavailable
     */
    public function getTesseractLangs(): array
    {
        $usable = class_exists('thiagoalessio\TesseractOCR\TesseractOCR')
            && $this->checkTesseractInstalled();

        return $usable
            ? $this->newTesseract()->command->getAvailableLanguages()
            : [];
    }

    /**
     * Puts every file currently flagged as processing back into the pending state.
     *
     * @return int Number of files changed from processing to pending.
     */
    public function releaseAllProcessing(): int
    {
        $files = $this->table('tiki_files');
        $result = $files->updateMultiple(
            ['ocr_state' => self::OCR_STATUS_PENDING],
            ['ocr_state' => self::OCR_STATUS_PROCESSING]
        );

        return $result->numrows;
    }

    /**
     * Change stalled flags back to pending.
     *
     * @return int Number of files changed from stalled to pending.
*/
    public function releaseAllStalled(): int
    {
        $changes = $this->table('tiki_files')->updateMultiple(
            ['ocr_state' => self::OCR_STATUS_PENDING],
            ['ocr_state' => self::OCR_STATUS_STALLED]
        );

        return $changes->numrows;
    }

    /**
     * Set $nextOCRFile with the fileId of the next file scheduled to be processed by the OCR engine.
     *
     * Pending files are taken in ascending fileId order. When $nextOCRFile already holds an id,
     * only larger ids are considered.
     */
    public function setNextOCRFile()
    {
        $db = $this->table('tiki_files');
        $conditions = ['ocr_state' => self::OCR_STATUS_PENDING];
        if ($this->nextOCRFile) {
            // we always take a greater file id to avoid infinite loops
            $conditions['fileId'] = $db->GreaterThan($this->nextOCRFile);
        }
        $this->nextOCRFile = $db->fetchOne('fileId', $conditions, ['fileId' => 'ASC']);
    }

    /**
     * Creates a new tesseract instance.
     *
     * @param null|string $fileName File path of file to OCR. Null if no file.
     *
     * @return TesseractOCR An instance using the ocr_tesseract_path preference as executable when it is set.
     */
    private function newTesseract(?string $fileName = null)
    {
        global $prefs;

        $tesseract = new TesseractOCR($fileName);
        if (! empty($prefs['ocr_tesseract_path'])) {
            $tesseract->executable($prefs['ocr_tesseract_path']);
        }

        return $tesseract;
    }

    /**
     * Finds the languages that a file will be / has been processed with.
     *
     * Lookup order: file level (only when the ocr_file_level preference is 'y'), the file's
     * gallery, gallery 1, the ocr_limit_languages preference, and finally 'osd'.
     *
     * @param null|int $fileId null defaults to the current file being worked on, otherwise it uses the passed fileid.
     *
     * @return array List of file specific languages
     */
    public function listFileLanguages(?int $fileId = null): array
    {
        global $prefs;

        if (! $fileId) {
            $fileId = $this->ocrIngNow;
        }

        // Stored values are JSON; anything that does not decode to an array counts as "not set".
        // This keeps the array return type intact.
        $decode = static function ($raw): array {
            $decoded = json_decode((string) $raw, true);

            return is_array($decoded) ? array_values($decoded) : [];
        };

        $files = $this->table('tiki_files');
        $langs = [];

        // first set file level languages if they exist
        if (! empty($prefs['ocr_file_level']) && $prefs['ocr_file_level'] === 'y') {
            $langs = $decode($files->fetchOne('ocr_lang', ['fileId' => $fileId]));
        }

        // if no file level languages we look for gallery level language preferences
        if (empty($langs)) {
            $galId = $files->fetchOne('galleryId', ['fileId' => $fileId]);
            $galleries = $this->table('tiki_file_galleries');
            $langs = $decode($galleries->fetchOne('ocr_lang', ['galleryId' => $galId]));

            // if gallery does not have preferences, we take a look at the master gallery for direction.
            // The id is cast because the database layer may hand it back as a string.
            if (empty($langs) && (int) $galId !== 1) {
                $langs = $decode($galleries->fetchOne('ocr_lang', ['galleryId' => 1]));
            }
        }

        if (empty($langs) && ! empty($prefs['ocr_limit_languages'])) {
            $langs = $prefs['ocr_limit_languages'];
        }

        // we fall back on Auto Detect if there are no preferences set
        if (empty($langs)) {
            $langs = ['osd'];
        }

        return $langs;
    }

    /**
     * OCR's the file held in $nextOCRFile. Intended to be used by a CLI command, as OCRing a large
     * file may cause timeouts.
     *
     * The file is flagged as processing, $ocrIngNow is set to its id and $nextOCRFile is advanced.
     * On success the recognised text is stored in ocr_data, the search index entry is invalidated
     * and the file is flagged as finished. On failure the file is flagged as stalled. No value is
     * returned.
     *
     * @throws Exception If there is no file to process or a problem occurs while processing it
     */
    public function OCRfile()
    {
        global $prefs;

        if (! $this->nextOCRFile) {
            throw new Exception('No files to OCR');
        }

        // Remember which file this run owns before the queue pointer moves on. Looking it up by
        // state afterwards could pick a different file that is also flagged as processing.
        $this->ocrIngNow = $this->nextOCRFile;

        // Set the database state to reflect that the next file in the queue has begun
        $this->table('tiki_files')->update(
            ['ocr_state' => self::OCR_STATUS_PROCESSING],
            ['fileId' => $this->ocrIngNow]
        );
        $this->setNextOCRFile();

        // Created up front so the failure path and the cleanup can always rely on them.
        $filesystem = new Filesystem();
        /** @var string|null $tempFile Raw copy of the stored file, null once handed over or removed */
        $tempFile = null;
        /** @var string|null $fileName File or directory in a format readable to Tesseract */
        $fileName = null;

        try {
            $file = TikiLib::lib('filegal')->get_file($this->ocrIngNow);
            if (empty($file)) {
                throw new Exception('The File ID specified does not exist.');
            }

            if (! empty($file['data'])) {
                $tempFile = writeTempFile($file['data']);
            } else {
                $directory = $prefs['fgal_use_dir'];
                // lets make sure there is a slash following the directory name
                if (substr($directory, -1) !== '/') {
                    $directory .= '/';
                }
                $fileContent = @file_get_contents($directory . $file['path']);
                if ($fileContent === false) {
                    throw new Exception('Reading ' . $file['path'] . ' failed');
                }
                $tempFile = writeTempFile($fileContent);
                unset($fileContent);
            }

            // now that we have a temp file written to file, lets start processing it
            if (in_array($file['filetype'], self::OCR_MIME_CONVERT, true)) {
                if (! is_callable('imagepng')) {
                    throw new Exception('Install GD to convert.');
                }
                $rawImage = file_get_contents($tempFile);
                $image = $rawImage === false ? false : imagecreatefromstring($rawImage);
                unset($rawImage);
                if ($image === false) {
                    throw new Exception('Unable to read the image for conversion.');
                }
                // Reserve a unique path, then free it so imagepng() creates the file itself.
                $fileName = writeTempFile('');
                unlink($fileName);
                if (! imagepng($image, $fileName)) {
                    throw new Exception('Unable to convert the image to PNG.');
                }
                unset($image);
            } elseif (in_array($file['filetype'], self::OCR_MIME_NATIVE, true)) {
                $fileName = $tempFile;
                $tempFile = null; // ownership moved to $fileName, so it is only removed once.
            } elseif (in_array($file['filetype'], self::PDF_MIME, true)) {
                TikiLib::lib('pdfimages');
                $image = new PdfImagesLib();
                $image->setBinaryPath();
                $image->setArgument('tiff');
                $fileName = writeTempFile(null, 'random'); // in this case we create a directory for writing files to.
                $image->setFilePaths($tempFile, $fileName);
                $image->run();
                unset($image);
            } else {
                // fall back onto media alchemist if the file type is not otherwise convertible.
                if (! class_exists('MediaAlchemyst\Alchemyst')) {
                    throw new Exception('Install Media Alchemist to convert.');
                }
                $alchemy = new Alchemy\AlchemyLib();
                // We create a empty temp file and then delete it, so we know its writable before passing to alchemy
                $fileName = writeTempFile('');
                unlink($fileName);
                if ($alchemy->convertToImage($tempFile, $fileName) === null) {
                    throw new Exception('Media Alchemist unable to convert file');
                }
            }

            // now that we are done with the temp file, lets delete it.
            if ($tempFile !== null) {
                $filesystem->remove($tempFile);
                $tempFile = null;
            }

            $langs = $this->listFileLanguages();
            if (is_dir($fileName)) {
                // NOTE(review): the pattern assumes $fileName ends with a directory separator - confirm
                // against writeTempFile().
                $OCRText = '';
                foreach (glob($fileName . '*.tif') ?: [] as $tiffFile) {
                    $OCRText .= ($this->newTesseract($tiffFile))->lang(...$langs)->run();
                }
            } else {
                $OCRText = ($this->newTesseract($fileName))->lang(...$langs)->run();
            }

            $OCRText = TikiFilter::get('striptags')->filter($OCRText);
            $this->table('tiki_files')->update(
                ['ocr_data' => $OCRText],
                ['fileId' => $this->ocrIngNow]
            );

            $unifiedsearchlib = TikiLib::lib('unifiedsearch');
            $unifiedsearchlib->invalidateObject('file', $this->ocrIngNow);
            $unifiedsearchlib->processUpdateQueue();

            // change the ocr state from processing to finished OCR'ing.
            // $ocrIngNow keeps the file id; it is not replaced by the update result.
            $this->table('tiki_files')->update(
                ['ocr_state' => self::OCR_STATUS_FINISHED],
                ['fileId' => $this->ocrIngNow]
            );
        } catch (Exception $e) {
            // Set the database flag to reflect that it is no longer processing but, still needs to be OCR'd.
            // This happens before cleanup so a cleanup problem cannot leave the file flagged as processing.
            $this->table('tiki_files')->update(
                ['ocr_state' => self::OCR_STATUS_STALLED],
                ['fileId' => $this->ocrIngNow]
            );
            throw new Exception($e->getMessage(), 0, $e);
        } finally {
            // if we had to create temp files, lets remove them.
            foreach ([$tempFile, $fileName] as $leftover) {
                if ($leftover !== null) {
                    $filesystem->remove($leftover);
                }
            }
        }
    }
}