You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
199 lines
5.5 KiB
199 lines
5.5 KiB
<?php |
|
|
|
/** |
|
* @file |
|
* Module file for roblib_update_book_ocr. |
|
*/ |
|
|
|
/**
 * Implements hook_islandora_paged_content_pages_management_tabs_alter().
 *
 * Adds a collapsed "Create or Update Book OCR" fieldset, holding the OCR
 * aggregation form, under the 'manage_pages' entry of the management tabs.
 * The fieldset is only accessible when islandora_object_access() grants
 * ISLANDORA_ADD_DS on the object found in the context.
 */
function roblib_update_book_ocr_islandora_paged_content_pages_management_tabs_alter(&$manage_tabs, $context) {
  $book = $context['object'];

  $fieldset = array();
  $fieldset['#access'] = islandora_object_access(ISLANDORA_ADD_DS, $book);
  $fieldset['#title'] = t('Create or Update Book OCR');
  $fieldset['#type'] = 'fieldset';
  $fieldset['form'] = drupal_get_form('roblib_update_book_ocr_manage_pages_ocr_form', $book);
  $fieldset['#collapsible'] = TRUE;
  $fieldset['#collapsed'] = TRUE;

  $manage_tabs['manage_pages']['book_ocr'] = $fieldset;
}
|
|
|
/**
 * Form builder for the book level OCR aggregation form.
 *
 * Stores the given object in the form state so the submit handler can read
 * it back, then returns an explanatory item together with a submit button.
 *
 * @param array $form
 *   The Drupal form. It is not used; a fresh form array is returned.
 * @param array $form_state
 *   The Drupal form state. Receives the object under the 'object' key.
 * @param AbstractObject $object
 *   The object (book) whose page level OCR is to be aggregated.
 *
 * @return array
 *   The Drupal form.
 */
function roblib_update_book_ocr_manage_pages_ocr_form(array $form, array &$form_state, AbstractObject $object) {
  $form_state['object'] = $object;

  // The help text is a single translatable string that spans two lines.
  $help_text = t('Aggregates all the page level OCR into one OCR datastream at the book level.<br/>
This will not generate OCR/HOCR at the page level. The page level OCR must already exist.');

  $build = array();
  $build['description'] = array(
    '#type' => 'item',
    '#description' => $help_text,
  );
  $build['submit'] = array(
    '#disabled' => FALSE,
    '#type' => 'submit',
    '#value' => t('Create/Update Book OCR'),
  );

  return $build;
}
|
|
|
/**
 * Submit handler: starts the batch that aggregates page OCR for the book.
 *
 * @param array $form
 *   The Drupal form.
 * @param array $form_state
 *   The Drupal form state. The book is read from its 'object' key.
 */
function roblib_update_book_ocr_manage_pages_ocr_form_submit(array $form, array &$form_state) {
  $book = $form_state['object'];

  // Only the keys of the page list are handed on to the batch.
  $page_list = islandora_paged_content_get_pages($book);
  $page_ids = array_keys($page_list);

  roblib_update_book_ocr_batch($page_ids, $book);
}
|
|
|
|
|
/**
 * Sets up and starts the batch that collects the OCR of every page.
 *
 * One operation (_roblib_update_book_ocr_update_book) is queued per page and
 * roblib_update_book_ocr_batch_finished is registered as the finished
 * callback. After processing, the user is sent to the book's object page.
 *
 * @param array $pages
 *   A list of page identifiers related to the book.
 * @param AbstractObject $object
 *   The book object. Its id is passed to every operation and is used to build
 *   the redirect path.
 */
function roblib_update_book_ocr_batch($pages, $object) {
  // Without pages no operation would run to record the results the finished
  // callback reads, so there is nothing useful a batch could do.
  if (empty($pages)) {
    drupal_set_message(t('No pages were found for this book, so no book level OCR was created.'), 'warning');
    return;
  }

  $batch = [
    'title' => t('Updating/Creating Book OCR ...'),
    'operations' => [],
    'init_message' => t('starting'),
    'progress_message' => t('Processed @current out of @total.'),
    'error_message' => t('An error occurred during processing'),
    'finished' => 'roblib_update_book_ocr_batch_finished',
  ];

  $total = count($pages);
  foreach ($pages as $page) {
    $batch['operations'][] = [
      '_roblib_update_book_ocr_update_book',
      [$page, $total, $object->id],
    ];
  }

  batch_set($batch);
  batch_process('islandora/object/' . $object->id);
}
|
|
|
/**
 * Batch operation: appends the OCR of one page to the collected book OCR.
 *
 * Every queued operation handles exactly one page in a single pass. For that
 * reason $context['finished'] is left untouched: the Batch API repeats an
 * operation for as long as it reports a value below 1, which would append the
 * same page's OCR again on every repeat.
 *
 * @param string $page
 *   Identifier of the page whose OCR is fetched.
 * @param int $total
 *   The number of pages in the book. No longer used here; kept so the
 *   operation signature stays the same for already queued operations.
 * @param string $parent_pid
 *   The PID of the book, recorded in the results for the finished callback.
 * @param array $context
 *   The batch context. Uses the 'results' keys 'pages_processed', 'ocr' and
 *   'parent', and sets 'message'.
 */
function _roblib_update_book_ocr_update_book($page, $total, $parent_pid, &$context) {
  if (!isset($context['results']['pages_processed'])) {
    // The counter starts at 1 on purpose: the finished callback subtracts 1
    // from it when reporting how many pages were processed.
    $context['results']['pages_processed'] = 1;
    $context['results']['ocr'] = '';
  }
  // The parent lives in 'results', so that is also where its presence is
  // checked ('sandbox' is never given a 'parent' key).
  if (!isset($context['results']['parent'])) {
    $context['results']['parent'] = $parent_pid;
  }

  $context['results']['pages_processed']++;

  // The stored counter runs one ahead of the number of the current page.
  $message_args = [
    '!p' => $context['results']['pages_processed'] - 1,
    '%pid' => $page,
  ];
  $context['message'] = t('Retrieving OCR for page !p %pid', $message_args);

  try {
    // Keep as string for now, OCR should be in kilobytes.
    $context['results']['ocr'] .= roblib_update_book_ocr_get_ocr($page);
  }
  catch (Exception $e) {
    // Best effort: a page that fails is logged and skipped so the remaining
    // pages can still be collected.
    $context['message'] = t('Error retrieving OCR for page !p %pid', $message_args);
    watchdog('roblib_update_book_ocr', $e->getMessage(), NULL, WATCHDOG_ERROR, NULL);
  }
}
|
|
|
/**
 * Writes the aggregated OCR text to the OCR datastream of the book.
 *
 * When the book already has an OCR datastream its content is replaced;
 * otherwise a new 'text/plain' datastream labelled 'OCR' is constructed and
 * ingested. If the book cannot be loaded the failure is logged and nothing
 * is written.
 *
 * @param string $parent_id
 *   The PID of the book to update.
 * @param string $ocr
 *   The OCR text to store.
 */
function roblib_update_book_ocr($parent_id, $ocr) {
  $dsid = 'OCR';
  $parent_object = islandora_object_load($parent_id);

  // Guard against a failed load; calling methods on a non-object is fatal.
  if (!$parent_object) {
    watchdog('roblib_update_book_ocr', 'Unable to load book %pid; the book level OCR was not updated.', array('%pid' => $parent_id), WATCHDOG_ERROR);
    return;
  }

  if (isset($parent_object[$dsid])) {
    // Existing datastream: only the content is replaced, nothing is ingested.
    $parent_object[$dsid]->content = $ocr;
    return;
  }

  $datastream = $parent_object->constructDatastream($dsid);
  $datastream->label = $dsid;
  $datastream->mimeType = 'text/plain';
  $datastream->content = $ocr;
  $parent_object->ingestDatastream($datastream);
}
|
|
|
/**
 * Returns the content of the OCR datastream of a page.
 *
 * @param string $page
 *   Identifier of the page, as accepted by islandora_object_load().
 *
 * @return string
 *   The content of the page's OCR datastream, or an empty string (after
 *   logging an informational message) when no OCR datastream is found.
 *
 * @throws \Exception
 *   Not thrown here directly; presumably exceptions from loading the object
 *   or reading the datastream can propagate - the batch operation that calls
 *   this function catches Exception.
 */
function roblib_update_book_ocr_get_ocr($page) {
  $dsid = 'OCR';
  $page_object = islandora_object_load($page);

  if (isset($page_object[$dsid])) {
    return $page_object[$dsid]->content;
  }

  // Pass the raw message and its placeholders to watchdog() rather than a
  // string already run through t(), so the log entry stays translatable.
  watchdog('roblib_update_book_ocr', 'Object %page did not contain an OCR datastream', array('%page' => $page), WATCHDOG_INFO);
  return '';
}
|
|
|
|
|
/**
 * Batch finished callback: stores the collected OCR on the book.
 *
 * The book is only updated when the batch completed successfully and the
 * page operations recorded a parent PID, the OCR text and a page counter.
 * Otherwise an error message is shown and the book is left untouched, so a
 * partial or missing aggregate never replaces the existing OCR.
 *
 * @param bool $success
 *   Whether the batch completed without errors.
 * @param array $results
 *   The results gathered by the page operations: 'parent', 'ocr' and
 *   'pages_processed'.
 * @param array $operations
 *   The operations that remained unprocessed. Not used.
 *
 * @return string
 *   A short, translated status text.
 */
function roblib_update_book_ocr_batch_finished($success, $results, $operations) {
  if (!$success || !isset($results['parent'], $results['ocr'], $results['pages_processed'])) {
    drupal_set_message(t('The book level OCR was not updated because the page level OCR could not be collected.'), 'error');
    return t('Unable to process pages.');
  }

  roblib_update_book_ocr($results['parent'], $results['ocr']);

  // The page counter starts at 1, so it is one higher than the page count.
  $message = t('Finished Processing OCR for the page objects, processed %total pages', array(
    '%total' => $results['pages_processed'] - 1,
  ));
  drupal_set_message($message, 'info');

  return t('Finished Processing Pages.');
}
|
|
|