creates an OCR datastream at the book level.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

200 lines
5.5 KiB

4 years ago
<?php
/**
* @file
* Module file for roblib_update_book_ocr.
*/
/**
 * Implements hook_islandora_paged_content_pages_management_tabs_alter().
 *
 * Adds a collapsed "Create or Update Book OCR" fieldset to the page
 * management tabs. The fieldset embeds the OCR aggregation form and is only
 * accessible when islandora_object_access() grants ISLANDORA_ADD_DS on the
 * object supplied in the alter context.
 *
 * @param array $manage_tabs
 *   The management tabs render array, altered in place.
 * @param array $context
 *   The alter context; its 'object' entry is the object being managed.
 */
function roblib_update_book_ocr_islandora_paged_content_pages_management_tabs_alter(&$manage_tabs, $context) {
  $book = $context['object'];

  $fieldset = array();
  $fieldset['#access'] = islandora_object_access(ISLANDORA_ADD_DS, $book);
  $fieldset['#title'] = t('Create or Update Book OCR');
  $fieldset['#type'] = 'fieldset';
  $fieldset['form'] = drupal_get_form('roblib_update_book_ocr_manage_pages_ocr_form', $book);
  $fieldset['#collapsible'] = TRUE;
  $fieldset['#collapsed'] = TRUE;

  $manage_tabs['manage_pages']['book_ocr'] = $fieldset;
}
/**
 * Form builder for the "Create/Update Book OCR" form.
 *
 * The form only shows a short explanation and a submit button. The object is
 * stored in the form state so that the submit handler can retrieve it. The
 * incoming $form array is not used; a newly built array is returned.
 *
 * @param array $form
 *   The Drupal form.
 * @param array $form_state
 *   The Drupal form state.
 * @param AbstractObject $object
 *   The object whose pages will be aggregated on submit.
 *
 * @return array
 *   The Drupal form.
 */
function roblib_update_book_ocr_manage_pages_ocr_form(array $form, array &$form_state, AbstractObject $object) {
  $form_state['object'] = $object;

  $elements = [];

  // The help text contains a literal line break inside the translatable
  // string, so its second line must stay flush-left to keep the string intact.
  $elements['description'] = [
    '#type' => 'item',
    '#description' => t('Aggregates all the page level OCR into one OCR datastream at the book level.<br/>
This will not generate OCR/HOCR at the page level. The page level OCR must already exist.'),
  ];

  $elements['submit'] = [
    '#disabled' => FALSE,
    '#type' => 'submit',
    '#value' => t('Create/Update Book OCR'),
  ];

  return $elements;
}
/**
 * Form submission handler for roblib_update_book_ocr_manage_pages_ocr_form().
 *
 * Reads the object stored in the form state, collects the keys returned by
 * islandora_paged_content_get_pages() for it, and passes both to
 * roblib_update_book_ocr_batch().
 *
 * @param array $form
 *   The Drupal form.
 * @param array $form_state
 *   The Drupal form state; 'object' holds the object set by the form builder.
 */
function roblib_update_book_ocr_manage_pages_ocr_form_submit(array $form, array &$form_state) {
  $book = $form_state['object'];
  $page_map = islandora_paged_content_get_pages($book);
  // Only the array keys are handed on as the page list.
  $page_ids = array_keys($page_map);
  roblib_update_book_ocr_batch($page_ids, $book);
}
/**
 * Sets up and starts the batch that aggregates page OCR into the book.
 *
 * One batch operation is queued per page. When the batch completes the user
 * is sent to 'islandora/object/<id>' for the given object.
 *
 * @param array $pages
 *   A list of page identifiers related to the book.
 * @param object $object
 *   The book object; only its 'id' property is read here.
 */
function roblib_update_book_ocr_batch($pages, $object) {
  // With no pages there would be no operations, so no batch results would
  // ever be recorded and the finished callback would have no parent PID or
  // OCR text to work with. Bail out early instead of starting the batch.
  if (empty($pages)) {
    drupal_set_message(t('No pages were found for this book, so the book OCR was not updated.'), 'warning');
    return;
  }

  $total = count($pages);
  $operations = [];
  foreach ($pages as $page) {
    $operations[] = [
      '_roblib_update_book_ocr_update_book',
      [$page, $total, $object->id],
    ];
  }

  $batch = [
    'title' => t('Updating/Creating Book OCR ...'),
    'operations' => $operations,
    'init_message' => t('starting'),
    'progress_message' => t('Processed @current out of @total.'),
    'error_message' => t('An error occurred during processing'),
    'finished' => 'roblib_update_book_ocr_batch_finished',
  ];
  batch_set($batch);
  batch_process('islandora/object/' . $object->id);
}
/**
 * Batch operation: appends one page's OCR text to the accumulated book OCR.
 *
 * Each page is queued as its own operation, so this callback is a single-pass
 * operation and must not report partial completion through
 * $context['finished'].
 *
 * The following keys are maintained in $context['results'] for the finished
 * callback:
 * - 'pages_processed': starts at 1 and is incremented once per page, so it is
 *   always one more than the number of pages handled so far.
 * - 'ocr': the concatenated OCR text of all pages handled so far.
 * - 'parent': the PID of the book.
 *
 * @param string $page
 *   The identifier of the page whose OCR is fetched.
 * @param int $total
 *   The total number of pages in the batch. Kept for backward compatibility
 *   with already queued operations; no longer used.
 * @param string $parent_pid
 *   The PID of the book the page belongs to.
 * @param array $context
 *   The batch context.
 */
function _roblib_update_book_ocr_update_book($page, $total, $parent_pid, &$context) {
  if (!isset($context['results']['pages_processed'])) {
    // The counter deliberately starts at 1; the finished callback subtracts 1
    // again when it reports the number of processed pages.
    $context['results']['pages_processed'] = 1;
    $context['results']['ocr'] = '';
  }
  $context['results']['pages_processed']++;

  if (!isset($context['results']['parent'])) {
    $context['results']['parent'] = $parent_pid;
  }

  // Compensate for the counter's offset so the first page is shown as page 1.
  $page_number = $context['results']['pages_processed'] - 1;

  $context['message'] = t('Retrieving OCR for page !p %pid', [
    '!p' => $page_number,
    '%pid' => $page,
  ]);

  try {
    // Keep as string for now, OCR should be in kilobytes.
    $context['results']['ocr'] .= roblib_update_book_ocr_get_ocr($page);
  }
  catch (Exception $e) {
    // Best effort: a page that fails is logged and skipped so that the
    // remaining pages are still aggregated.
    $context['message'] = t('Error retrieving OCR for page !p %pid', [
      '!p' => $page_number,
      '%pid' => $page,
    ]);
    watchdog('roblib_update_book_ocr', $e->getMessage(), NULL, WATCHDOG_ERROR, NULL);
  }

  // $context['finished'] is intentionally left untouched. Setting it below 1
  // tells the Batch API to call this same operation again, which would append
  // the same page's OCR more than once.
}
/**
 * Writes the aggregated OCR text to the OCR datastream of the book object.
 *
 * If the book already has an OCR datastream its content is replaced;
 * otherwise a new text/plain datastream labelled 'OCR' is constructed and
 * ingested. If the book cannot be loaded, an error is logged and nothing is
 * written.
 *
 * @param string $parent_id
 *   The PID of the book object.
 * @param string $ocr
 *   The OCR text to store.
 */
function roblib_update_book_ocr($parent_id, $ocr) {
  $dsid = 'OCR';
  $parent_object = islandora_object_load($parent_id);

  // Without this guard a failed load leads to a method call on a non-object.
  if (!$parent_object) {
    watchdog('roblib_update_book_ocr', 'Unable to load object %pid; the book OCR was not updated.', array('%pid' => $parent_id), WATCHDOG_ERROR);
    return;
  }

  if (isset($parent_object[$dsid])) {
    // Existing datastream: only the content needs replacing.
    $datastream = $parent_object[$dsid];
    $datastream->content = $ocr;
    return;
  }

  // No datastream yet: build one and add it to the object.
  $datastream = $parent_object->constructDatastream($dsid);
  $datastream->label = $dsid;
  $datastream->mimeType = 'text/plain';
  $datastream->content = $ocr;
  $parent_object->ingestDatastream($datastream);
}
/**
 * Returns the content of a page object's OCR datastream.
 *
 * When the loaded page has no 'OCR' datastream an informational log entry is
 * written and an empty string is returned.
 *
 * @param string $page
 *   The identifier of the page object to load.
 *
 * @return string
 *   The OCR datastream content, or an empty string if there is none.
 *
 * @throws \Exception
 *   Presumably raised by the repository layer when loading the object or
 *   reading the datastream fails; the batch operation that calls this wraps
 *   the call in a try/catch.
 */
function roblib_update_book_ocr_get_ocr($page) {
  $dsid = 'OCR';
  $page_object = islandora_object_load($page);

  if (!isset($page_object[$dsid])) {
    // Pass the untranslated message plus its variables; watchdog() handles
    // the substitution and translation when the log entry is displayed.
    watchdog('roblib_update_book_ocr', 'Object %page did not contain an OCR datastream', array('%page' => $page), WATCHDOG_INFO, NULL);
    return '';
  }

  return $page_object[$dsid]->content;
}
/**
 * Batch 'finished' callback: stores the aggregated OCR on the book.
 *
 * The OCR is only written when the batch reports success and the operations
 * recorded the book PID, the OCR text and the page counter in $results.
 * Otherwise an error message is shown and the book is left untouched.
 *
 * @param bool $success
 *   TRUE if the batch reported that all operations completed.
 * @param array $results
 *   The results collected by the batch operations: 'parent' (book PID),
 *   'ocr' (aggregated text) and 'pages_processed' (number of pages plus one).
 * @param array $operations
 *   The operations that remained unprocessed; not used here.
 *
 * @return string
 *   A short translated status string.
 */
function roblib_update_book_ocr_batch_finished($success, $results, $operations) {
  // A failed batch, or one that ran no operations, leaves these keys unset.
  // Writing in that state would store partial or NULL OCR on the book.
  if (!$success || !isset($results['parent'], $results['ocr'], $results['pages_processed'])) {
    drupal_set_message(t('The book OCR was not updated because the pages could not be processed.'), 'error');
    return t('Book OCR update failed.');
  }

  roblib_update_book_ocr($results['parent'], $results['ocr']);

  // The page counter starts at 1, hence the subtraction.
  $message = t('Finished Processing OCR for the page objects, processed %total pages', array(
    '%total' => $results['pages_processed'] - 1,
  ));
  drupal_set_message($message, 'info');
  return t('Finished Processing Pages.');
}