islandora_object_access(ISLANDORA_ADD_DS, $context['object']), '#title' => t('Create or Update Book OCR'), '#type' => 'fieldset', 'form' => drupal_get_form('roblib_update_book_ocr_manage_pages_ocr_form', $context['object']), '#collapsible' => TRUE, '#collapsed' => TRUE, ); }

/**
 * Form builder for the "Create/Update Book OCR" form.
 *
 * The form consists of an explanatory note and a submit button. The book
 * object is stored in the form state so the submit handler can reach it.
 *
 * @param array $form
 *   The Drupal form. Not used; a fresh form array is returned.
 * @param array $form_state
 *   The Drupal form state.
 * @param AbstractObject $object
 *   The book object whose page level OCR is to be aggregated.
 *
 * @return array
 *   The Drupal form.
 */
function roblib_update_book_ocr_manage_pages_ocr_form(array $form, array &$form_state, AbstractObject $object) {
  $form_state['object'] = $object;

  $built_form = array();
  $built_form['description'] = array(
    '#type' => 'item',
    // The line break inside this string is part of the translatable text.
    '#description' => t('Aggregates all the page level OCR into one OCR datastream at the book level.
This will not generate OCR/HOCR at the page level. The page level OCR must already exist.'),
  );
  $built_form['submit'] = array(
    '#disabled' => FALSE,
    '#type' => 'submit',
    '#value' => t('Create/Update Book OCR'),
  );
  return $built_form;
}

/**
 * Form submission handler: starts the book OCR aggregation batch.
 *
 * The keys of the array returned by islandora_paged_content_get_pages() are
 * used as the page identifiers handed to the batch.
 *
 * @param array $form
 *   The Drupal form.
 * @param array $form_state
 *   The Drupal form state; 'object' holds the book set by the form builder.
 */
function roblib_update_book_ocr_manage_pages_ocr_form_submit(array $form, array &$form_state) {
  $book = $form_state['object'];
  $page_ids = array_keys(islandora_paged_content_get_pages($book));
  roblib_update_book_ocr_batch($page_ids, $book);
}

/**
 * Sets up the batch that gathers the page level OCR of a book.
 *
 * One operation is queued per page. batch_process() is intentionally not
 * called: this function is invoked from a form submit handler, where the Form
 * API processes any batch registered through batch_set() by itself.
 *
 * @param array $pages
 *   A list of identifiers of the pages related to the book.
 * @param AbstractObject $object
 *   The book object; its id is passed along to every operation.
 */
function roblib_update_book_ocr_batch($pages, $object) {
  $total = count($pages);

  $operations = [];
  foreach ($pages as $page) {
    $operations[] = [
      '_roblib_update_book_ocr_update_book',
      [$page, $total, $object->id],
    ];
  }

  batch_set([
    'title' => t('Updating/Creating Book OCR ...'),
    'operations' => $operations,
    'init_message' => t('starting'),
    'progress_message' => t('Processed @current out of @total.'),
    'error_message' => t('An error occurred during processing'),
    'finished' => 'roblib_update_book_ocr_batch_finished',
  ]);
}

/**
 * Batch operation: appends the OCR text of one page to the batch results.
 *
 * Results written for the finished callback:
 * - 'ocr': the concatenated OCR text collected so far.
 * - 'pages_processed': the number of pages handled so far.
 * - 'parent': the identifier of the book receiving the aggregated OCR.
 *
 * @param string $page
 *   Identifier of the page object to read the OCR from.
 * @param int $total
 *   Total number of pages in the batch. Not needed here any more; retained so
 *   the arguments queued by roblib_update_book_ocr_batch() still line up.
 * @param string $parent_pid
 *   Identifier of the book object.
 * @param array $context
 *   The batch context.
 */
function _roblib_update_book_ocr_update_book($page, $total, $parent_pid, &$context) {
  if (!isset($context['results']['pages_processed'])) {
    $context['results']['pages_processed'] = 0;
    $context['results']['ocr'] = '';
  }
  if (!isset($context['results']['parent'])) {
    $context['results']['parent'] = $parent_pid;
  }

  $context['message'] = t('Retrieving OCR for page %pid', [
    '%pid' => $page,
  ]);

  try {
    // Kept as one in-memory string for now; OCR text is expected to be small.
    $context['results']['ocr'] .= roblib_update_book_ocr_get_ocr($page);
  }
  catch (Exception $e) {
    // A failing page is logged and skipped so the remaining pages still run.
    $context['message'] = t('Error retrieving OCR for page %pid', [
      '%pid' => $page,
    ]);
    watchdog('roblib_update_book_ocr', $e->getMessage(), NULL, WATCHDOG_ERROR, NULL);
  }

  $context['results']['pages_processed']++;

  // Every operation handles exactly one page, so it is always complete after
  // a single call. Reporting a fraction below 1 here would make the Batch API
  // invoke this same operation again and append the page's OCR repeatedly.
  $context['finished'] = 1;
}

/**
 * Writes the aggregated OCR text to the OCR datastream of the book.
 *
 * An existing OCR datastream has its content replaced; otherwise a new
 * text/plain datastream is constructed and ingested.
 *
 * @param string $parent_id
 *   Identifier of the book object.
 * @param string $ocr
 *   The aggregated OCR text.
 */
function roblib_update_book_ocr($parent_id, $ocr) {
  $dsid = 'OCR';
  $parent_object = islandora_object_load($parent_id);
  if (!$parent_object) {
    // Without a loaded object there is nothing to attach the datastream to.
    watchdog('roblib_update_book_ocr', 'Unable to load object %pid, the book OCR was not updated.', array('%pid' => $parent_id), WATCHDOG_ERROR);
    return;
  }

  if (isset($parent_object[$dsid])) {
    $parent_object[$dsid]->content = $ocr;
    return;
  }

  $datastream = $parent_object->constructDatastream($dsid);
  $datastream->label = $dsid;
  $datastream->mimeType = 'text/plain';
  $datastream->content = $ocr;
  $parent_object->ingestDatastream($datastream);
}

/**
 * Reads the OCR text of a single page object.
 *
 * A page without an OCR datastream is logged at info level and contributes
 * an empty string.
 *
 * @param string $page
 *   Identifier of the page object.
 *
 * @return string
 *   The content of the page's OCR datastream, or '' when it has none.
 *
 * @throws \Exception
 *   Nothing is caught here, so anything thrown while loading the object or
 *   reading the datastream reaches the caller.
 */
function roblib_update_book_ocr_get_ocr($page) {
  $dsid = 'OCR';
  $page_object = islandora_object_load($page);
  if (isset($page_object[$dsid])) {
    return $page_object[$dsid]->content;
  }

  watchdog('roblib_update_book_ocr', t('Object %page did not contain an OCR datastream', array('%page' => $page)), NULL, WATCHDOG_INFO, NULL);
  return '';
}

/**
 * Batch finished callback: stores the aggregated OCR on the book.
 *
 * Nothing is written when the batch did not complete or when no page was
 * processed, so an existing book level OCR is never replaced by partial or
 * missing data.
 *
 * @param bool $success
 *   TRUE when the batch completed without errors.
 * @param array $results
 *   The results gathered by _roblib_update_book_ocr_update_book().
 * @param array $operations
 *   The operations that remained unprocessed.
 *
 * @return string|null
 *   A translated completion message, or NULL when nothing was written.
 */
function roblib_update_book_ocr_batch_finished($success, $results, $operations) {
  if (!$success) {
    drupal_set_message(t('The book OCR was not updated because the batch did not complete successfully.'), 'error');
    return NULL;
  }
  if (empty($results['parent']) || !isset($results['ocr'])) {
    // Happens when the book has no pages: no operation ever ran.
    drupal_set_message(t('The book OCR was not updated because no pages were processed.'), 'warning');
    return NULL;
  }

  roblib_update_book_ocr($results['parent'], $results['ocr']);

  $message = t('Finished Processing OCR for the page objects, processed %total pages', array(
    '%total' => $results['pages_processed'],
  ));
  drupal_set_message($message, 'info');
  return t('Finished Processing Pages.');
}