You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
200 lines
5.5 KiB
200 lines
5.5 KiB
4 years ago
|
<?php
|
||
|
|
||
|
/**
|
||
|
* @file
|
||
|
* Module file for roblib_update_book_ocr.
|
||
|
*/
|
||
|
|
||
|
/**
 * Implements hook_islandora_paged_content_pages_management_tabs_alter().
 *
 * Adds a collapsed "Create or Update Book OCR" fieldset, wrapping the
 * book-level OCR form, under the 'manage_pages' tab.
 *
 * @param array $manage_tabs
 *   The management tabs render array, altered by reference.
 * @param array $context
 *   Alter context; the 'object' key is read as the book being managed.
 */
function roblib_update_book_ocr_islandora_paged_content_pages_management_tabs_alter(&$manage_tabs, $context) {
  $book = $context['object'];

  // Assemble the fieldset key by key, in the order it should be rendered.
  $fieldset = array();
  $fieldset['#access'] = islandora_object_access(ISLANDORA_ADD_DS, $book);
  $fieldset['#title'] = t('Create or Update Book OCR');
  $fieldset['#type'] = 'fieldset';
  $fieldset['form'] = drupal_get_form('roblib_update_book_ocr_manage_pages_ocr_form', $book);
  $fieldset['#collapsible'] = TRUE;
  $fieldset['#collapsed'] = TRUE;

  $manage_tabs['manage_pages']['book_ocr'] = $fieldset;
}
|
||
|
|
||
|
/**
 * Form builder: a single button that (re)builds the book-level OCR.
 *
 * The form holds only an explanatory item and a submit button. The book
 * object is stored in the form state so the submit handler can reach it.
 *
 * @param array $form
 *   The Drupal form. It is not used; a fresh form array is returned.
 * @param array $form_state
 *   The Drupal form state; receives the book under the 'object' key.
 * @param AbstractObject $object
 *   The book object whose pages will be read.
 *
 * @return array
 *   The Drupal form definition.
 */
function roblib_update_book_ocr_manage_pages_ocr_form(array $form, array &$form_state, AbstractObject $object) {
  $form_state['object'] = $object;

  // Explanatory text shown above the button.
  $explanation = [
    '#type' => 'item',
    '#description' => t('Aggregates all the page level OCR into one OCR datastream at the book level.<br/>
This will not generate OCR/HOCR at the page level. The page level OCR must already exist.'),
  ];

  $button = [
    '#disabled' => FALSE,
    '#type' => 'submit',
    '#value' => t('Create/Update Book OCR'),
  ];

  return [
    'description' => $explanation,
    'submit' => $button,
  ];
}
|
||
|
|
||
|
/**
 * Submit handler: starts the batch that gathers OCR from the book's pages.
 *
 * @param array $form
 *   The Drupal form.
 * @param array $form_state
 *   The Drupal form state; the book object is read from its 'object' key,
 *   where the form builder stored it.
 */
function roblib_update_book_ocr_manage_pages_ocr_form_submit(array $form, array &$form_state) {
  $book = $form_state['object'];

  // Only the keys of the page listing are handed to the batch.
  roblib_update_book_ocr_batch(
    array_keys(islandora_paged_content_get_pages($book)),
    $book
  );
}
|
||
|
|
||
|
|
||
|
/**
 * Builds and starts the batch that aggregates page OCR for a book.
 *
 * One operation is queued per page; every operation receives the page, the
 * number of pages and the book's id. When processing ends the user is sent to
 * the book's 'islandora/object/<id>' page.
 *
 * NOTE(review): the Drupal 7 Batch API documentation says batch_process()
 * should not be called manually when the batch is set from a form submit
 * handler. This function is called from one in this file - confirm whether
 * the explicit call is needed for other callers.
 *
 * @param array $pages
 *   A list of pages related to the book, one batch operation each.
 * @param object $object
 *   The book object; only its 'id' property is read here.
 */
function roblib_update_book_ocr_batch($pages, $object) {
  $book_pid = $object->id;
  $page_count = count($pages);

  // Queue the per-page callback once for every page.
  $operations = [];
  foreach ($pages as $page_id) {
    $operations[] = [
      '_roblib_update_book_ocr_update_book',
      [$page_id, $page_count, $book_pid],
    ];
  }

  batch_set([
    'title' => t('Updating/Creating Book OCR ...'),
    'operations' => $operations,
    'init_message' => t('starting'),
    'progress_message' => t('Processed @current out of @total.'),
    'error_message' => t('An error occurred during processing'),
    'finished' => 'roblib_update_book_ocr_batch_finished',
  ]);
  batch_process('islandora/object/' . $book_pid);
}
|
||
|
|
||
|
/**
 * Batch operation: appends one page's OCR text to the batch results.
 *
 * Every page is queued as its own operation, so this callback is single-pass
 * and must leave $context['finished'] at the Batch API default of 1. Reporting
 * a fraction below 1 makes the Batch API call the same operation again for the
 * same page, which would append that page's OCR more than once.
 *
 * @param string $page
 *   Identifier of the page object, as passed to islandora_object_load().
 * @param int $total
 *   Number of pages queued for the book. Kept for compatibility with batches
 *   that are already queued; overall progress is tracked by the Batch API.
 * @param string $parent_pid
 *   Identifier of the book that will receive the aggregated OCR.
 * @param array $context
 *   The batch context. 'results' accumulates 'ocr', 'parent' and
 *   'pages_processed' for the finished callback.
 */
function _roblib_update_book_ocr_update_book($page, $total, $parent_pid, &$context) {
  if (!isset($context['results']['pages_processed'])) {
    // The counter starts at 1 (not 0) because the finished callback reports
    // 'pages_processed - 1'; keep both sides in step.
    $context['results']['pages_processed'] = 1;
    $context['results']['ocr'] = '';
  }
  $context['results']['pages_processed']++;

  // 'results' (unlike 'sandbox') survives across operations, so the book id
  // only needs to be recorded once.
  if (!isset($context['results']['parent'])) {
    $context['results']['parent'] = $parent_pid;
  }

  // The stored counter runs one ahead of the page currently being handled.
  $message_args = [
    '!p' => $context['results']['pages_processed'] - 1,
    '%pid' => $page,
  ];
  $context['message'] = t('Retrieving OCR for page !p %pid', $message_args);

  try {
    // Keep as a string for now; OCR text is expected to be kilobytes in size.
    $context['results']['ocr'] .= roblib_update_book_ocr_get_ocr($page);
  }
  catch (Exception $e) {
    // A failing page is logged and skipped so the remaining pages still run.
    $context['message'] = t('Error retrieving OCR for page !p %pid', $message_args);
    watchdog('roblib_update_book_ocr', $e->getMessage(), NULL, WATCHDOG_ERROR, NULL);
  }
}
|
||
|
|
||
|
/**
 * Stores the aggregated OCR text in the book's OCR datastream.
 *
 * Replaces the content of an existing 'OCR' datastream, or constructs and
 * ingests a new 'text/plain' one when the book has none. If the book cannot
 * be loaded, the failure is logged and nothing is written.
 *
 * @param string $parent_id
 *   Identifier of the book object, passed to islandora_object_load().
 * @param string $ocr
 *   The text to store as the datastream content.
 */
function roblib_update_book_ocr($parent_id, $ocr) {
  $dsid = 'OCR';
  $parent_object = islandora_object_load($parent_id);

  // A failed load does not yield an object; calling methods on it would be a
  // fatal error, so log and stop instead.
  if (!is_object($parent_object)) {
    watchdog('roblib_update_book_ocr', 'Could not load object %pid; the book OCR was not updated.', array('%pid' => $parent_id), WATCHDOG_ERROR);
    return;
  }

  if (isset($parent_object[$dsid])) {
    // Existing datastream: assigning the content is all that is needed.
    $datastream = $parent_object[$dsid];
    $datastream->content = $ocr;
    return;
  }

  // No OCR datastream yet: build one and ingest it into the book.
  $datastream = $parent_object->constructDatastream($dsid);
  $datastream->label = $dsid;
  $datastream->mimeType = 'text/plain';
  $datastream->content = $ocr;
  $parent_object->ingestDatastream($datastream);
}
|
||
|
|
||
|
/**
 * Returns the OCR text stored on a page object.
 *
 * Exceptions raised while loading the object or reading the datastream are
 * not caught here; the batch operation in this file that calls this function
 * catches Exception.
 *
 * @param string $page
 *   Identifier of the page object, passed to islandora_object_load().
 *
 * @return string
 *   The content of the page's 'OCR' datastream. An empty string is returned,
 *   and an informational log entry written, when no such datastream is found
 *   on the loaded value.
 */
function roblib_update_book_ocr_get_ocr($page) {
  $dsid = 'OCR';
  $page_object = islandora_object_load($page);

  if (isset($page_object[$dsid])) {
    return $page_object[$dsid]->content;
  }

  // watchdog() takes the untranslated message plus a variables array, so the
  // entry can be translated when the log is viewed.
  watchdog('roblib_update_book_ocr', 'Object %page did not contain an OCR datastream', array('%page' => $page), WATCHDOG_INFO);
  return '';
}
|
||
|
|
||
|
|
||
|
/**
 * Batch 'finished' callback: writes the collected OCR to the book.
 *
 * The book is only updated when the batch succeeded and the operations left
 * their results behind. A failed batch, or one without operations (a book
 * with no pages), reports an error instead of writing partial or missing OCR.
 *
 * @param bool $success
 *   TRUE when the batch completed without errors.
 * @param array $results
 *   Values gathered by the operations: 'parent' (book id), 'ocr' (the
 *   concatenated text) and 'pages_processed' (a counter that starts at 1).
 * @param array $operations
 *   Operations that were left unprocessed; not used.
 *
 * @return string
 *   A translated summary of the outcome.
 */
function roblib_update_book_ocr_batch_finished($success, $results, $operations) {
  if (!$success || !isset($results['parent'], $results['ocr'], $results['pages_processed'])) {
    $failure = t('The book OCR was not updated because the page OCR could not be collected.');
    drupal_set_message($failure, 'error');
    return $failure;
  }

  roblib_update_book_ocr($results['parent'], $results['ocr']);

  // The operation callback starts its counter at 1, hence the subtraction.
  $message = t('Finished Processing OCR for the page objects, processed %total pages', array(
    '%total' => $results['pages_processed'] - 1,
  ));
  drupal_set_message($message, 'info');
  return t('Finished Processing Pages.');
}
|