Browse Source

Merge pull request #897 from Islandora/hocr

Add hOCR option to Text Extraction Media Attachment action and IIIF Manifest
pull/904/merge 2.4.5
Willow Gillingham 2 years ago committed by GitHub
parent
commit
bdbef45baa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 34
      modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php
  2. 29
      modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php

34
modules/islandora_iiif/src/Plugin/views/style/IIIFManifest.php

@ -189,21 +189,27 @@ class IIIFManifest extends StylePluginBase {
*/
protected function getTileSourceFromRow(ResultRow $row, $iiif_address, $iiif_base_id) {
$canvases = [];
foreach ($this->options['iiif_tile_field'] as $iiif_tile_field) {
foreach (array_filter(array_values($this->options['iiif_tile_field'])) as $iiif_tile_field) {
$viewsField = $this->view->field[$iiif_tile_field];
$iiif_ocr_file_field = !empty($this->options['iiif_ocr_file_field']) ? array_filter(array_values($this->options['iiif_ocr_file_field'])) : [];
$ocrField = count($iiif_ocr_file_field) > 0 ? $this->view->field[$iiif_ocr_file_field[0]] : NULL;
$entity = $viewsField->getEntity($row);
if (isset($entity->{$viewsField->definition['field_name']})) {
/** @var \Drupal\Core\Field\FieldItemListInterface $images */
$images = $entity->{$viewsField->definition['field_name']};
foreach ($images as $image) {
foreach ($images as $i => $image) {
if (!$image->entity->access('view')) {
// If the user does not have permission to view the file, skip it.
continue;
}
$ocrs = $entity->{$ocrField->definition['field_name']};
// Create the IIIF URL for this file
// Visiting $iiif_url will resolve to the info.json for the image.
$ocr = isset($ocrs[$i]) ? $ocrs[$i] : FALSE;
$file_url = $image->entity->createFileUrl(FALSE);
$mime_type = $image->entity->getMimeType();
$iiif_url = rtrim($iiif_address, '/') . '/' . urlencode($file_url);
@ -241,8 +247,7 @@ class IIIFManifest extends StylePluginBase {
}
}
}
$canvases[] = [
$tmp_canvas = [
// @see https://iiif.io/api/presentation/2.1/#canvas
'@id' => $canvas_id,
'@type' => 'sc:Canvas',
@ -271,6 +276,17 @@ class IIIFManifest extends StylePluginBase {
],
],
];
if (isset($ocr) && $ocr != FALSE) {
$tmp_canvas['seeAlso'] = [
'@id' => $ocr->entity->createFileUrl(FALSE),
'format' => 'text/vnd.hocr+html',
'profile' => 'http://kba.cloud/hocr-spec',
'label' => 'hOCR embedded text',
];
}
$canvases[] = $tmp_canvas;
}
}
}
@ -313,6 +329,7 @@ class IIIFManifest extends StylePluginBase {
$options = parent::defineOptions();
$options['iiif_tile_field'] = ['default' => ''];
$options['iiif_ocr_file_field'] = ['default' => ''];
return $options;
}
@ -368,6 +385,15 @@ class IIIFManifest extends StylePluginBase {
// otherwise could lock up the form when setting up a View.
'#required' => count($field_options) > 0,
];
$form['iiif_ocr_file_field'] = [
'#title' => $this->t('Structured OCR data file field'),
'#type' => 'checkboxes',
'#default_value' => $this->options['iiif_ocr_file_field'],
'#description' => $this->t('The source of structured OCR text for each entity.'),
'#options' => $field_options,
'#required' => FALSE,
];
}
/**

29
modules/islandora_text_extraction/src/Plugin/Action/GenerateOCRDerivativeFile.php

@ -8,7 +8,7 @@ use Drupal\Core\Url;
use Drupal\islandora\Plugin\Action\AbstractGenerateDerivativeMediaFile;
/**
* Emits a Node for generating fits derivatives event.
* Generates OCR derivatives event.
*
* @Action(
* id = "generate_extracted_text_file",
@ -29,6 +29,7 @@ class GenerateOCRDerivativeFile extends AbstractGenerateDerivativeMediaFile {
$config['destination_media_type'] = 'file';
$config['scheme'] = $this->config->get('default_scheme');
$config['destination_text_field_name'] = '';
$config['text_format'] = 'plain_text';
return $config;
}
@ -38,7 +39,7 @@ class GenerateOCRDerivativeFile extends AbstractGenerateDerivativeMediaFile {
public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
$map = $this->entityFieldManager->getFieldMapByFieldType('text_long');
$file_fields = $map['media'];
$field_options = array_combine(array_keys($file_fields), array_keys($file_fields));
$field_options = ['none' => $this->t('None')] + array_combine(array_keys($file_fields), array_keys($file_fields));
$form = parent::buildConfigurationForm($form, $form_state);
$form['mimetype']['#description'] = $this->t('Mimetype to convert to (e.g. application/xml, etc...)');
$form['mimetype']['#value'] = 'text/plain';
@ -48,13 +49,23 @@ class GenerateOCRDerivativeFile extends AbstractGenerateDerivativeMediaFile {
$last = array_slice($form, count($form) - $position + 1);
$middle['destination_text_field_name'] = [
'#required' => TRUE,
'#required' => FALSE,
'#type' => 'select',
'#options' => $field_options,
'#title' => $this->t('Destination Text field Name'),
'#default_value' => $this->configuration['destination_text_field_name'],
'#description' => $this->t('Text field on Media Type to hold extracted text.'),
];
$middle['text_format'] = [
'#type' => 'select',
'#title' => $this->t('Format'),
'#options' => [
'plain_text' => $this->t('Plain text'),
'hocr' => $this->t('hOCR text with positional data'),
],
'#default_value' => $this->configuration['text_format'],
'#description' => $this->t("The type of text to be returned."),
];
$form = array_merge($first, $middle, $last);
unset($form['args']);
@ -81,17 +92,29 @@ class GenerateOCRDerivativeFile extends AbstractGenerateDerivativeMediaFile {
public function submitConfigurationForm(array &$form, FormStateInterface $form_state) {
  // Let the parent class store its own submitted values first.
  parent::submitConfigurationForm($form, $form_state);

  // Read the chosen output format once and reuse it below.
  $text_format = $form_state->getValue('text_format');

  $this->configuration['destination_text_field_name'] = $form_state->getValue('destination_text_field_name');
  $this->configuration['text_format'] = $text_format;

  // Store the command-line arguments that match the selected format.
  switch ($text_format) {
    case 'hocr':
      $this->configuration['args'] = '-c tessedit_create_hocr=1 -c hocr_font_info=0';
      break;

    case 'plain_text':
      // Fixed: this branch previously assigned to the undefined variable
      // $his, so switching back from hOCR to plain text never cleared the
      // stored hOCR arguments (and raises an Error on PHP 8).
      $this->configuration['args'] = '';
      break;
  }
}
/**
* Override this to return arbitrary data as an array to be json encoded.
*/
protected function generateData(EntityInterface $entity) {
$data = parent::generateData($entity);
$route_params = [
'media' => $entity->id(),
'destination_field' => $this->configuration['destination_field_name'],
'destination_text_field' => $this->configuration['destination_text_field_name'],
'text_format' => $this->configuration['text_format'],
];
$data['destination_uri'] = Url::fromRoute('islandora_text_extraction.attach_file_to_media', $route_params)
->setAbsolute()

Loading…
Cancel
Save