dannylamb
5 years ago
committed by
Seth Shaw
22 changed files with 1021 additions and 0 deletions
@ -0,0 +1,23 @@ |
|||||||
|
# Field storage definition for the integer "field_weight" field on nodes.
langcode: en
status: true
dependencies:
  module:
    - field_permissions
    - node
third_party_settings:
  field_permissions:
    permission_type: public
id: node.field_weight
field_name: field_weight
entity_type: node
type: integer
settings:
  unsigned: false
  size: normal
# Module that provides the "integer" field type.
module: core
locked: false
cardinality: 1
translatable: true
indexes: { }
persist_with_no_fields: false
custom_storage: false
@ -0,0 +1,339 @@ |
|||||||
|
GNU GENERAL PUBLIC LICENSE |
||||||
|
Version 2, June 1991 |
||||||
|
|
||||||
|
Copyright (C) 1989, 1991 Free Software Foundation, Inc., |
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||||||
|
Everyone is permitted to copy and distribute verbatim copies |
||||||
|
of this license document, but changing it is not allowed. |
||||||
|
|
||||||
|
Preamble |
||||||
|
|
||||||
|
The licenses for most software are designed to take away your |
||||||
|
freedom to share and change it. By contrast, the GNU General Public |
||||||
|
License is intended to guarantee your freedom to share and change free |
||||||
|
software--to make sure the software is free for all its users. This |
||||||
|
General Public License applies to most of the Free Software |
||||||
|
Foundation's software and to any other program whose authors commit to |
||||||
|
using it. (Some other Free Software Foundation software is covered by |
||||||
|
the GNU Lesser General Public License instead.) You can apply it to |
||||||
|
your programs, too. |
||||||
|
|
||||||
|
When we speak of free software, we are referring to freedom, not |
||||||
|
price. Our General Public Licenses are designed to make sure that you |
||||||
|
have the freedom to distribute copies of free software (and charge for |
||||||
|
this service if you wish), that you receive source code or can get it |
||||||
|
if you want it, that you can change the software or use pieces of it |
||||||
|
in new free programs; and that you know you can do these things. |
||||||
|
|
||||||
|
To protect your rights, we need to make restrictions that forbid |
||||||
|
anyone to deny you these rights or to ask you to surrender the rights. |
||||||
|
These restrictions translate to certain responsibilities for you if you |
||||||
|
distribute copies of the software, or if you modify it. |
||||||
|
|
||||||
|
For example, if you distribute copies of such a program, whether |
||||||
|
gratis or for a fee, you must give the recipients all the rights that |
||||||
|
you have. You must make sure that they, too, receive or can get the |
||||||
|
source code. And you must show them these terms so they know their |
||||||
|
rights. |
||||||
|
|
||||||
|
We protect your rights with two steps: (1) copyright the software, and |
||||||
|
(2) offer you this license which gives you legal permission to copy, |
||||||
|
distribute and/or modify the software. |
||||||
|
|
||||||
|
Also, for each author's protection and ours, we want to make certain |
||||||
|
that everyone understands that there is no warranty for this free |
||||||
|
software. If the software is modified by someone else and passed on, we |
||||||
|
want its recipients to know that what they have is not the original, so |
||||||
|
that any problems introduced by others will not reflect on the original |
||||||
|
authors' reputations. |
||||||
|
|
||||||
|
Finally, any free program is threatened constantly by software |
||||||
|
patents. We wish to avoid the danger that redistributors of a free |
||||||
|
program will individually obtain patent licenses, in effect making the |
||||||
|
program proprietary. To prevent this, we have made it clear that any |
||||||
|
patent must be licensed for everyone's free use or not licensed at all. |
||||||
|
|
||||||
|
The precise terms and conditions for copying, distribution and |
||||||
|
modification follow. |
||||||
|
|
||||||
|
GNU GENERAL PUBLIC LICENSE |
||||||
|
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION |
||||||
|
|
||||||
|
0. This License applies to any program or other work which contains |
||||||
|
a notice placed by the copyright holder saying it may be distributed |
||||||
|
under the terms of this General Public License. The "Program", below, |
||||||
|
refers to any such program or work, and a "work based on the Program" |
||||||
|
means either the Program or any derivative work under copyright law: |
||||||
|
that is to say, a work containing the Program or a portion of it, |
||||||
|
either verbatim or with modifications and/or translated into another |
||||||
|
language. (Hereinafter, translation is included without limitation in |
||||||
|
the term "modification".) Each licensee is addressed as "you". |
||||||
|
|
||||||
|
Activities other than copying, distribution and modification are not |
||||||
|
covered by this License; they are outside its scope. The act of |
||||||
|
running the Program is not restricted, and the output from the Program |
||||||
|
is covered only if its contents constitute a work based on the |
||||||
|
Program (independent of having been made by running the Program). |
||||||
|
Whether that is true depends on what the Program does. |
||||||
|
|
||||||
|
1. You may copy and distribute verbatim copies of the Program's |
||||||
|
source code as you receive it, in any medium, provided that you |
||||||
|
conspicuously and appropriately publish on each copy an appropriate |
||||||
|
copyright notice and disclaimer of warranty; keep intact all the |
||||||
|
notices that refer to this License and to the absence of any warranty; |
||||||
|
and give any other recipients of the Program a copy of this License |
||||||
|
along with the Program. |
||||||
|
|
||||||
|
You may charge a fee for the physical act of transferring a copy, and |
||||||
|
you may at your option offer warranty protection in exchange for a fee. |
||||||
|
|
||||||
|
2. You may modify your copy or copies of the Program or any portion |
||||||
|
of it, thus forming a work based on the Program, and copy and |
||||||
|
distribute such modifications or work under the terms of Section 1 |
||||||
|
above, provided that you also meet all of these conditions: |
||||||
|
|
||||||
|
a) You must cause the modified files to carry prominent notices |
||||||
|
stating that you changed the files and the date of any change. |
||||||
|
|
||||||
|
b) You must cause any work that you distribute or publish, that in |
||||||
|
whole or in part contains or is derived from the Program or any |
||||||
|
part thereof, to be licensed as a whole at no charge to all third |
||||||
|
parties under the terms of this License. |
||||||
|
|
||||||
|
c) If the modified program normally reads commands interactively |
||||||
|
when run, you must cause it, when started running for such |
||||||
|
interactive use in the most ordinary way, to print or display an |
||||||
|
announcement including an appropriate copyright notice and a |
||||||
|
notice that there is no warranty (or else, saying that you provide |
||||||
|
a warranty) and that users may redistribute the program under |
||||||
|
these conditions, and telling the user how to view a copy of this |
||||||
|
License. (Exception: if the Program itself is interactive but |
||||||
|
does not normally print such an announcement, your work based on |
||||||
|
the Program is not required to print an announcement.) |
||||||
|
|
||||||
|
These requirements apply to the modified work as a whole. If |
||||||
|
identifiable sections of that work are not derived from the Program, |
||||||
|
and can be reasonably considered independent and separate works in |
||||||
|
themselves, then this License, and its terms, do not apply to those |
||||||
|
sections when you distribute them as separate works. But when you |
||||||
|
distribute the same sections as part of a whole which is a work based |
||||||
|
on the Program, the distribution of the whole must be on the terms of |
||||||
|
this License, whose permissions for other licensees extend to the |
||||||
|
entire whole, and thus to each and every part regardless of who wrote it. |
||||||
|
|
||||||
|
Thus, it is not the intent of this section to claim rights or contest |
||||||
|
your rights to work written entirely by you; rather, the intent is to |
||||||
|
exercise the right to control the distribution of derivative or |
||||||
|
collective works based on the Program. |
||||||
|
|
||||||
|
In addition, mere aggregation of another work not based on the Program |
||||||
|
with the Program (or with a work based on the Program) on a volume of |
||||||
|
a storage or distribution medium does not bring the other work under |
||||||
|
the scope of this License. |
||||||
|
|
||||||
|
3. You may copy and distribute the Program (or a work based on it, |
||||||
|
under Section 2) in object code or executable form under the terms of |
||||||
|
Sections 1 and 2 above provided that you also do one of the following: |
||||||
|
|
||||||
|
a) Accompany it with the complete corresponding machine-readable |
||||||
|
source code, which must be distributed under the terms of Sections |
||||||
|
1 and 2 above on a medium customarily used for software interchange; or, |
||||||
|
|
||||||
|
b) Accompany it with a written offer, valid for at least three |
||||||
|
years, to give any third party, for a charge no more than your |
||||||
|
cost of physically performing source distribution, a complete |
||||||
|
machine-readable copy of the corresponding source code, to be |
||||||
|
distributed under the terms of Sections 1 and 2 above on a medium |
||||||
|
customarily used for software interchange; or, |
||||||
|
|
||||||
|
c) Accompany it with the information you received as to the offer |
||||||
|
to distribute corresponding source code. (This alternative is |
||||||
|
allowed only for noncommercial distribution and only if you |
||||||
|
received the program in object code or executable form with such |
||||||
|
an offer, in accord with Subsection b above.) |
||||||
|
|
||||||
|
The source code for a work means the preferred form of the work for |
||||||
|
making modifications to it. For an executable work, complete source |
||||||
|
code means all the source code for all modules it contains, plus any |
||||||
|
associated interface definition files, plus the scripts used to |
||||||
|
control compilation and installation of the executable. However, as a |
||||||
|
special exception, the source code distributed need not include |
||||||
|
anything that is normally distributed (in either source or binary |
||||||
|
form) with the major components (compiler, kernel, and so on) of the |
||||||
|
operating system on which the executable runs, unless that component |
||||||
|
itself accompanies the executable. |
||||||
|
|
||||||
|
If distribution of executable or object code is made by offering |
||||||
|
access to copy from a designated place, then offering equivalent |
||||||
|
access to copy the source code from the same place counts as |
||||||
|
distribution of the source code, even though third parties are not |
||||||
|
compelled to copy the source along with the object code. |
||||||
|
|
||||||
|
4. You may not copy, modify, sublicense, or distribute the Program |
||||||
|
except as expressly provided under this License. Any attempt |
||||||
|
otherwise to copy, modify, sublicense or distribute the Program is |
||||||
|
void, and will automatically terminate your rights under this License. |
||||||
|
However, parties who have received copies, or rights, from you under |
||||||
|
this License will not have their licenses terminated so long as such |
||||||
|
parties remain in full compliance. |
||||||
|
|
||||||
|
5. You are not required to accept this License, since you have not |
||||||
|
signed it. However, nothing else grants you permission to modify or |
||||||
|
distribute the Program or its derivative works. These actions are |
||||||
|
prohibited by law if you do not accept this License. Therefore, by |
||||||
|
modifying or distributing the Program (or any work based on the |
||||||
|
Program), you indicate your acceptance of this License to do so, and |
||||||
|
all its terms and conditions for copying, distributing or modifying |
||||||
|
the Program or works based on it. |
||||||
|
|
||||||
|
6. Each time you redistribute the Program (or any work based on the |
||||||
|
Program), the recipient automatically receives a license from the |
||||||
|
original licensor to copy, distribute or modify the Program subject to |
||||||
|
these terms and conditions. You may not impose any further |
||||||
|
restrictions on the recipients' exercise of the rights granted herein. |
||||||
|
You are not responsible for enforcing compliance by third parties to |
||||||
|
this License. |
||||||
|
|
||||||
|
7. If, as a consequence of a court judgment or allegation of patent |
||||||
|
infringement or for any other reason (not limited to patent issues), |
||||||
|
conditions are imposed on you (whether by court order, agreement or |
||||||
|
otherwise) that contradict the conditions of this License, they do not |
||||||
|
excuse you from the conditions of this License. If you cannot |
||||||
|
distribute so as to satisfy simultaneously your obligations under this |
||||||
|
License and any other pertinent obligations, then as a consequence you |
||||||
|
may not distribute the Program at all. For example, if a patent |
||||||
|
license would not permit royalty-free redistribution of the Program by |
||||||
|
all those who receive copies directly or indirectly through you, then |
||||||
|
the only way you could satisfy both it and this License would be to |
||||||
|
refrain entirely from distribution of the Program. |
||||||
|
|
||||||
|
If any portion of this section is held invalid or unenforceable under |
||||||
|
any particular circumstance, the balance of the section is intended to |
||||||
|
apply and the section as a whole is intended to apply in other |
||||||
|
circumstances. |
||||||
|
|
||||||
|
It is not the purpose of this section to induce you to infringe any |
||||||
|
patents or other property right claims or to contest validity of any |
||||||
|
such claims; this section has the sole purpose of protecting the |
||||||
|
integrity of the free software distribution system, which is |
||||||
|
implemented by public license practices. Many people have made |
||||||
|
generous contributions to the wide range of software distributed |
||||||
|
through that system in reliance on consistent application of that |
||||||
|
system; it is up to the author/donor to decide if he or she is willing |
||||||
|
to distribute software through any other system and a licensee cannot |
||||||
|
impose that choice. |
||||||
|
|
||||||
|
This section is intended to make thoroughly clear what is believed to |
||||||
|
be a consequence of the rest of this License. |
||||||
|
|
||||||
|
8. If the distribution and/or use of the Program is restricted in |
||||||
|
certain countries either by patents or by copyrighted interfaces, the |
||||||
|
original copyright holder who places the Program under this License |
||||||
|
may add an explicit geographical distribution limitation excluding |
||||||
|
those countries, so that distribution is permitted only in or among |
||||||
|
countries not thus excluded. In such case, this License incorporates |
||||||
|
the limitation as if written in the body of this License. |
||||||
|
|
||||||
|
9. The Free Software Foundation may publish revised and/or new versions |
||||||
|
of the General Public License from time to time. Such new versions will |
||||||
|
be similar in spirit to the present version, but may differ in detail to |
||||||
|
address new problems or concerns. |
||||||
|
|
||||||
|
Each version is given a distinguishing version number. If the Program |
||||||
|
specifies a version number of this License which applies to it and "any |
||||||
|
later version", you have the option of following the terms and conditions |
||||||
|
either of that version or of any later version published by the Free |
||||||
|
Software Foundation. If the Program does not specify a version number of |
||||||
|
this License, you may choose any version ever published by the Free Software |
||||||
|
Foundation. |
||||||
|
|
||||||
|
10. If you wish to incorporate parts of the Program into other free |
||||||
|
programs whose distribution conditions are different, write to the author |
||||||
|
to ask for permission. For software which is copyrighted by the Free |
||||||
|
Software Foundation, write to the Free Software Foundation; we sometimes |
||||||
|
make exceptions for this. Our decision will be guided by the two goals |
||||||
|
of preserving the free status of all derivatives of our free software and |
||||||
|
of promoting the sharing and reuse of software generally. |
||||||
|
|
||||||
|
NO WARRANTY |
||||||
|
|
||||||
|
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY |
||||||
|
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN |
||||||
|
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES |
||||||
|
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED |
||||||
|
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
||||||
|
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS |
||||||
|
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE |
||||||
|
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, |
||||||
|
REPAIR OR CORRECTION. |
||||||
|
|
||||||
|
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING |
||||||
|
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR |
||||||
|
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, |
||||||
|
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING |
||||||
|
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED |
||||||
|
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY |
||||||
|
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER |
||||||
|
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE |
||||||
|
POSSIBILITY OF SUCH DAMAGES. |
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS |
||||||
|
|
||||||
|
How to Apply These Terms to Your New Programs |
||||||
|
|
||||||
|
If you develop a new program, and you want it to be of the greatest |
||||||
|
possible use to the public, the best way to achieve this is to make it |
||||||
|
free software which everyone can redistribute and change under these terms. |
||||||
|
|
||||||
|
To do so, attach the following notices to the program. It is safest |
||||||
|
to attach them to the start of each source file to most effectively |
||||||
|
convey the exclusion of warranty; and each file should have at least |
||||||
|
the "copyright" line and a pointer to where the full notice is found. |
||||||
|
|
||||||
|
<one line to give the program's name and a brief idea of what it does.> |
||||||
|
Copyright (C) <year> <name of author> |
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify |
||||||
|
it under the terms of the GNU General Public License as published by |
||||||
|
the Free Software Foundation; either version 2 of the License, or |
||||||
|
(at your option) any later version. |
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful, |
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||||
|
GNU General Public License for more details. |
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along |
||||||
|
with this program; if not, write to the Free Software Foundation, Inc., |
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
||||||
|
|
||||||
|
Also add information on how to contact you by electronic and paper mail. |
||||||
|
|
||||||
|
If the program is interactive, make it output a short notice like this |
||||||
|
when it starts in an interactive mode: |
||||||
|
|
||||||
|
Gnomovision version 69, Copyright (C) year name of author |
||||||
|
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. |
||||||
|
This is free software, and you are welcome to redistribute it |
||||||
|
under certain conditions; type `show c' for details. |
||||||
|
|
||||||
|
The hypothetical commands `show w' and `show c' should show the appropriate |
||||||
|
parts of the General Public License. Of course, the commands you use may |
||||||
|
be called something other than `show w' and `show c'; they could even be |
||||||
|
mouse-clicks or menu items--whatever suits your program. |
||||||
|
|
||||||
|
You should also get your employer (if you work as a programmer) or your |
||||||
|
school, if any, to sign a "copyright disclaimer" for the program, if |
||||||
|
necessary. Here is a sample; alter the names: |
||||||
|
|
||||||
|
Yoyodyne, Inc., hereby disclaims all copyright interest in the program |
||||||
|
`Gnomovision' (which makes passes at compilers) written by James Hacker. |
||||||
|
|
||||||
|
<signature of Ty Coon>, 1 April 1989 |
||||||
|
Ty Coon, President of Vice |
||||||
|
|
||||||
|
This General Public License does not permit incorporating your program into |
||||||
|
proprietary programs. If your program is a subroutine library, you may |
||||||
|
consider it more useful to permit linking proprietary applications with the |
||||||
|
library. If this is what you want to do, use the GNU Lesser General |
||||||
|
Public License instead of this License. |
@ -0,0 +1,44 @@ |
|||||||
|
# islandora_text_extraction |
||||||
|
### Connects Islandora 8 to Hypercube microservice and extracts text from PDFs |
||||||
|
|
||||||
|
Install module in the usual way, |
||||||
|
then copy `assets/ca.islandora.alpaca.connector.ocr.blueprint.xml` |
||||||
|
to `/opt/karaf/deploy` on the server. |
||||||
|
_note:_ This config file assumes a url of `http://localhost:8000/hypercube`. |
||||||
|
If your service is found elsewhere this must be changed. |
||||||
|
There is no need to restart. |
||||||
|
|
||||||
|
In the usual Ansible build this will require no modification. |
||||||
|
|
||||||
|
If a parent node is tagged as `Digital Document`, an `Image`-tagged media |
||||||
|
will extract text from that image at the time of ingestion. |
||||||
|
The content type of the parent node should be configured to allow multiple tags. |
||||||
|
|
||||||
|
_note:_ Media are linked to their parent nodes with the `Media Of` |
||||||
|
entity reference field. If you wish to attach the PDF (or any other) media type |
||||||
|
to a parent node which has any content type other than Repository Item |
||||||
|
(islandora_object) the parent content type will have to be added to the `Media Of` |
||||||
|
field in the media type description. |
||||||
|
|
||||||
|
## Prepare module for PDF text extraction |
||||||
|
Install `pdftotext` on your server if not already present. |
||||||
|
On an Ubuntu/Debian machine, like the one built by the default CLAW playbook, run |
||||||
|
`sudo apt-get install poppler-utils` |
||||||
|
|
||||||
|
Test that it has been properly installed with `which pdftotext`. |
||||||
|
|
||||||
|
Install the PHP library with `composer require spatie/pdf-to-text`. |
||||||
|
|
||||||
|
In the unlikely event that your `pdftotext` binary exists on your server |
||||||
|
outside of the system path, the path to the binary can be set at |
||||||
|
`/admin/config/islandora/text_extraction`. |
||||||
|
|
||||||
|
## Using text extraction ## |
||||||
|
The containing document must be tagged as `Digital Document`, |
||||||
|
and the media must be tagged as `Original File`. |
||||||
|
A new editable `Extracted Text` media will be created and attached when `PDF` or |
||||||
|
`Image` media types are added to a node. |
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,7 @@ |
|||||||
|
# Module metadata for Islandora Text Extraction.
name: 'Islandora Text Extraction'
type: module
description: 'Islandora 8 module to connect to Hypercube microservice, and to get text from PDF ingest'
core: 8.x
package: 'Islandora'
dependencies:
  - islandora
@ -0,0 +1,23 @@ |
|||||||
|
<?php |
||||||
|
|
||||||
|
/** |
||||||
|
* @file |
||||||
|
* Install/update hook implementations. |
||||||
|
*/ |
||||||
|
|
||||||
|
use Drupal\field\Entity\FieldConfig; |
||||||
|
|
||||||
|
/**
 * Implements hook_install().
 *
 * Appends the 'txt' extension to the allowed extensions of the
 * 'media.file.field_media_file' field config when it is not already listed.
 */
function islandora_text_extraction_install() {
  $field = FieldConfig::load("media.file.field_media_file");
  // The field config may not exist on this site; there is nothing to update.
  if (!$field) {
    return;
  }

  $fieldSettings = $field->getSettings();
  $extensions = isset($fieldSettings['file_extensions']) ? (string) $fieldSettings['file_extensions'] : '';

  // Compare whole extensions rather than using strpos(): strpos() returns 0
  // (falsy) when 'txt' is the first entry, and would also match substrings of
  // other extensions.
  $allowed = preg_split('/[\s,]+/', strtolower($extensions), -1, PREG_SPLIT_NO_EMPTY);
  if (!in_array('txt', $allowed, TRUE)) {
    $fieldSettings['file_extensions'] = trim($extensions . ' txt');
    $field->set('settings', $fieldSettings);
    $field->save();
  }
}
@ -0,0 +1,53 @@ |
|||||||
|
<?php |
||||||
|
|
||||||
|
/** |
||||||
|
* @file |
||||||
|
* Contains islandora_text_extraction.module. |
||||||
|
*/ |
||||||
|
|
||||||
|
use Drupal\Core\Form\FormStateInterface; |
||||||
|
use Drupal\Core\Routing\RouteMatchInterface; |
||||||
|
use Drupal\file\Entity\File; |
||||||
|
use Drupal\media\MediaInterface; |
||||||
|
|
||||||
|
/**
 * Implements hook_help().
 *
 * Returns the "About" markup for this module's help page and nothing for any
 * other route.
 */
function islandora_text_extraction_help($route_name, RouteMatchInterface $route_match) {
  // Only the module's own help route produces output.
  if ($route_name != 'help.page.islandora_text_extraction') {
    return;
  }

  $heading = '<h3>' . t('About') . '</h3>';
  $summary = '<p>' . t('Islandora 8 module to connect to Hypercube microservice') . '</p>';
  return '' . $heading . $summary;
}
||||||
|
|
||||||
|
/**
 * Implements hook_media_presave().
 *
 * For 'extracted_text' media with no value in 'field_edited_text', reads the
 * file referenced by 'field_media_file' and stores its contents (newlines
 * converted to <br /> tags) in 'field_edited_text' with the 'basic_html'
 * format.
 */
function islandora_text_extraction_media_presave(MediaInterface $media) {
  if ($media->bundle() != 'extracted_text') {
    return;
  }

  // Text that is already present is never overwritten.
  if ($media->get('field_edited_text')->getValue()) {
    return;
  }

  // The media may be saved without a file attached yet.
  $file_values = $media->get('field_media_file')->getValue();
  if (empty($file_values[0]['target_id'])) {
    return;
  }

  // The referenced file entity may no longer exist.
  $file = File::load($file_values[0]['target_id']);
  if (!$file) {
    return;
  }

  // file_get_contents() returns FALSE when the file cannot be read.
  $data = file_get_contents($file->getFileUri());
  if ($data === FALSE) {
    return;
  }

  $media->set('field_edited_text', nl2br($data));
  $media->field_edited_text->format = 'basic_html';
}
||||||
|
|
||||||
|
/**
 * Implements hook_form_FORM_ID_alter() for block_form.
 *
 * Removes the 'ocr_requested' and 'pdf_text_extraction_requested' entries
 * from the form's 'visibility' element.
 */
function islandora_text_extraction_form_block_form_alter(&$form, FormStateInterface $form_state, $form_id) {
  unset(
    $form['visibility']['ocr_requested'],
    $form['visibility']['pdf_text_extraction_requested']
  );
}
@ -0,0 +1,58 @@ |
|||||||
|
<?php |
||||||
|
|
||||||
|
namespace Drupal\islandora_text_extraction\Plugin\Action; |
||||||
|
|
||||||
|
use Drupal\Core\Form\FormStateInterface; |
||||||
|
use Drupal\islandora\Plugin\Action\AbstractGenerateDerivative; |
||||||
|
|
||||||
|
/**
 * Emits a Node for generating OCR derivatives event.
 *
 * @Action(
 *   id = "generate_ocr_derivative",
 *   label = @Translation("Get OCR from image"),
 *   type = "node"
 * )
 */
class GenerateOCRDerivative extends AbstractGenerateDerivative {

  /**
   * {@inheritdoc}
   */
  public function defaultConfiguration() {
    $config = parent::defaultConfiguration();
    $config['path'] = '[date:custom:Y]-[date:custom:m]/[node:nid]-[term:name].txt';
    // Keep the default in line with validateConfigurationForm(), which only
    // accepts text/* mimetypes, and with the value pinned on the form.
    $config['mimetype'] = 'text/plain';
    $config['queue'] = 'islandora-connector-ocr';
    $config['destination_media_type'] = 'file';
    return $config;
  }

  /**
   * {@inheritdoc}
   */
  public function buildConfigurationForm(array $form, FormStateInterface $form_state) {
    $form = parent::buildConfigurationForm($form, $form_state);
    $form['mimetype']['#description'] = t('Mimetype to convert to (e.g. text/plain, etc...)');
    // The mimetype element is pinned to text/plain.
    $form['mimetype']['#value'] = 'text/plain';
    $form['mimetype']['#type'] = 'textfield';

    // The 'args' element provided by the parent form is not offered here.
    unset($form['args']);
    return $form;
  }

  /**
   * {@inheritdoc}
   */
  public function validateConfigurationForm(array &$form, FormStateInterface $form_state) {
    parent::validateConfigurationForm($form, $form_state);
    // Cast to string so a missing value is rejected by the check below
    // instead of being passed to explode() as NULL.
    $exploded_mime = explode('/', (string) $form_state->getValue('mimetype'));
    // Only text/* mimetypes are accepted.
    if ($exploded_mime[0] != 'text') {
      $form_state->setErrorByName(
        'mimetype',
        t('Please enter file mimetype (e.g. text/plain.)')
      );
    }
  }

}
@ -0,0 +1,82 @@ |
|||||||
|
<?php |
||||||
|
|
||||||
|
namespace Drupal\islandora_text_extraction\Plugin\Field\FieldFormatter; |
||||||
|
|
||||||
|
use Drupal\Core\Field\FieldItemInterface; |
||||||
|
use Drupal\Core\Field\FieldItemListInterface; |
||||||
|
use Drupal\Core\Field\FormatterBase; |
||||||
|
use Drupal\Core\Form\FormStateInterface; |
||||||
|
use Drupal\file\Entity\File; |
||||||
|
|
||||||
|
/**
 * Plugin implementation of the 'ocr_formatter' formatter.
 *
 * Renders the contents of a referenced text file inline.
 *
 * @FieldFormatter(
 *   id = "ocr_formatter",
 *   label = @Translation("OCRed text formatter"),
 *   field_types = {"file"}
 * )
 */
class OcrTextFormatter extends FormatterBase {

  /**
   * {@inheritdoc}
   */
  public static function defaultSettings() {
    // This formatter defines no settings of its own.
    return [] + parent::defaultSettings();
  }

  /**
   * {@inheritdoc}
   */
  public function settingsForm(array $form, FormStateInterface $form_state) {
    // No formatter-specific form elements.
    return [] + parent::settingsForm($form, $form_state);
  }

  /**
   * {@inheritdoc}
   */
  public function settingsSummary() {
    // Nothing to summarise: the formatter has no settings.
    return [];
  }

  /**
   * {@inheritdoc}
   */
  public function viewElements(FieldItemListInterface $items, $langcode) {
    $elements = [];

    foreach ($items as $delta => $item) {
      $elements[$delta] = ['#markup' => $this->viewValue($item)];
    }

    return $elements;
  }

  /**
   * Generate the output appropriate for one field item.
   *
   * @param \Drupal\Core\Field\FieldItemInterface $item
   *   One field item.
   *
   * @return string
   *   The file's text as UTF-8, HTML-escaped, with newlines converted to
   *   <br /> tags. An empty string when the file is missing or unreadable.
   */
  protected function viewValue(FieldItemInterface $item) {
    $fileItem = $item->getValue();
    if (empty($fileItem['target_id'])) {
      return '';
    }

    // The referenced file entity may have been deleted.
    $file = File::load($fileItem['target_id']);
    if (!$file) {
      return '';
    }

    // file_get_contents() returns FALSE when the file cannot be read.
    $contents = file_get_contents($file->getFileUri());
    if ($contents === FALSE || $contents === '') {
      return '';
    }

    // Anything that is not valid UTF-8 is treated as ISO-8859-1, which is
    // what utf8_encode() assumed; utf8_encode() is deprecated as of PHP 8.2.
    if (!mb_check_encoding($contents, 'UTF-8')) {
      $contents = mb_convert_encoding($contents, 'UTF-8', 'ISO-8859-1');
    }

    // The file is plain text: escape it so characters such as '<' and '&'
    // are displayed literally instead of being interpreted as markup.
    $contents = htmlspecialchars($contents, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    return nl2br($contents);
  }

}
@ -0,0 +1,46 @@ |
|||||||
|
<?php |
||||||
|
|
||||||
|
namespace Drupal\Tests\islandora_text_extraction\Functional; |
||||||
|
|
||||||
|
use Drupal\Core\Url; |
||||||
|
use Drupal\Tests\BrowserTestBase; |
||||||
|
|
||||||
|
/**
 * Smoke test: the front page responds once the module is enabled.
 *
 * @group islandora_text_extraction
 */
class LoadTest extends BrowserTestBase {

  /**
   * The modules installed for this test.
   *
   * @var array
   */
  public static $modules = ['islandora_text_extraction'];

  /**
   * Account created with the 'administer site configuration' permission.
   *
   * @var \Drupal\user\UserInterface
   */
  protected $user;

  /**
   * {@inheritdoc}
   */
  protected function setUp() {
    parent::setUp();

    // Create the account, keep a handle on it, and log it in.
    $account = $this->drupalCreateUser(['administer site configuration']);
    $this->user = $account;
    $this->drupalLogin($account);
  }

  /**
   * Requests the front page and expects an HTTP 200 status code.
   */
  public function testLoad() {
    $front = Url::fromRoute('<front>');
    $this->drupalGet($front);

    $session = $this->assertSession();
    $session->statusCodeEquals(200);
  }

}
@ -0,0 +1,81 @@ |
|||||||
|
# Default-mode display configuration for the 'extracted_text' media bundle.
# NOTE(review): the widget-style types below (text_textarea, file_generic)
# suggest this is the entity form display - confirm against the file name.
langcode: en
status: true
dependencies:
  config:
    - field.field.media.extracted_text.field_edited_text
    - field.field.media.extracted_text.field_media_file
    - field.field.media.extracted_text.field_media_of
    - field.field.media.extracted_text.field_media_use
    - field.field.media.extracted_text.field_mime_type
    - media.type.extracted_text
  module:
    - file
    - path
    - text
id: media.extracted_text.default
targetEntityType: media
bundle: extracted_text
mode: default
# Components placed in the 'content' region, keyed by field name.
content:
  created:
    type: datetime_timestamp
    weight: 3
    region: content
    settings: { }
    third_party_settings: { }
  field_edited_text:
    type: text_textarea
    weight: 7
    region: content
    settings:
      rows: 5
      placeholder: ''
    third_party_settings: { }
  field_media_file:
    type: file_generic
    weight: 6
    region: content
    settings:
      progress_indicator: throbber
    third_party_settings: { }
  langcode:
    type: language_select
    weight: 1
    region: content
    settings:
      include_locked: true
    third_party_settings: { }
  name:
    type: string_textfield
    weight: 0
    region: content
    settings:
      size: 60
      placeholder: ''
    third_party_settings: { }
  path:
    type: path
    weight: 4
    region: content
    settings: { }
    third_party_settings: { }
  status:
    type: boolean_checkbox
    settings:
      display_label: true
    weight: 5
    region: content
    third_party_settings: { }
  uid:
    type: entity_reference_autocomplete
    weight: 2
    settings:
      match_operator: CONTAINS
      size: 60
      placeholder: ''
    region: content
    third_party_settings: { }
# Fields listed here are not placed in any region.
hidden:
  field_media_of: true
  field_media_use: true
  field_mime_type: true
@ -0,0 +1,65 @@ |
|||||||
|
# Default-mode display configuration for the 'extracted_text' media bundle.
# NOTE(review): the per-component 'label' keys and formatter-style types
# (timestamp, text_default, author) suggest this is the entity view display -
# confirm against the file name.
langcode: en
status: true
dependencies:
  config:
    - field.field.media.extracted_text.field_edited_text
    - field.field.media.extracted_text.field_media_file
    - field.field.media.extracted_text.field_media_of
    - field.field.media.extracted_text.field_media_use
    - field.field.media.extracted_text.field_mime_type
    - media.type.extracted_text
  module:
    - file
    - text
    - user
id: media.extracted_text.default
targetEntityType: media
bundle: extracted_text
mode: default
# Components placed in the 'content' region, keyed by field name.
content:
  created:
    label: hidden
    type: timestamp
    weight: 1
    region: content
    settings:
      date_format: medium
      custom_date_format: ''
      timezone: ''
    third_party_settings: { }
  field_edited_text:
    type: text_default
    weight: 3
    region: content
    label: above
    settings: { }
    third_party_settings: { }
  field_media_file:
    type: file_default
    weight: 2
    region: content
    label: above
    settings:
      use_description_as_link_text: true
    third_party_settings: { }
  field_media_of:
    type: entity_reference_label
    weight: 4
    region: content
    label: above
    settings:
      link: true
    third_party_settings: { }
  uid:
    label: hidden
    type: author
    weight: 0
    region: content
    settings: { }
    third_party_settings: { }
# Fields listed here are not placed in any region.
hidden:
  field_media_use: true
  field_mime_type: true
  langcode: true
  name: true
  thumbnail: true
@ -0,0 +1,20 @@ |
|||||||
|
# 'Edited Text' field (text_long) attached to the 'extracted_text' media
# bundle; optional, no default value.
langcode: en
status: true
dependencies:
  config:
    - field.storage.media.field_edited_text
    - media.type.extracted_text
  module:
    - text
id: media.extracted_text.field_edited_text
field_name: field_edited_text
entity_type: media
bundle: extracted_text
label: 'Edited Text'
description: ''
required: false
translatable: true
default_value: { }
default_value_callback: ''
settings: { }
field_type: text_long
@ -0,0 +1,26 @@ |
|||||||
|
# Required 'File' field attached to the 'extracted_text' media bundle.
langcode: en
status: true
dependencies:
  config:
    - field.storage.media.field_media_file
    - media.type.extracted_text
  module:
    - file
id: media.extracted_text.field_media_file
field_name: field_media_file
entity_type: media
bundle: extracted_text
label: File
description: ''
required: true
translatable: true
default_value: { }
default_value_callback: ''
settings:
  # Space-separated list of accepted upload extensions.
  file_extensions: 'txt doc docx pdf'
  # Token pattern built from the year and month date tokens.
  file_directory: '[date:custom:Y]-[date:custom:m]'
  max_filesize: ''
  description_field: false
  handler: 'default:file'
  handler_settings: { }
field_type: file
@ -0,0 +1,24 @@ |
|||||||
|
# Optional 'Media of' entity reference on the 'extracted_text' media bundle,
# using the 'default:node' selection handler.
langcode: en
status: true
dependencies:
  config:
    - field.storage.media.field_media_of
    - media.type.extracted_text
id: media.extracted_text.field_media_of
field_name: field_media_of
entity_type: media
bundle: extracted_text
label: 'Media of'
description: ''
required: false
translatable: true
default_value: { }
default_value_callback: ''
settings:
  handler: 'default:node'
  handler_settings:
    # No bundle restriction is configured here.
    target_bundles: null
    sort:
      field: _none
    auto_create: false
field_type: entity_reference
@ -0,0 +1,28 @@ |
|||||||
|
# Optional 'Media Use' entity reference on the 'extracted_text' media bundle,
# limited to terms of the 'islandora_media_use' vocabulary.
langcode: en
status: true
dependencies:
  config:
    - field.storage.media.field_media_use
    - media.type.extracted_text
    - taxonomy.vocabulary.islandora_media_use
id: media.extracted_text.field_media_use
field_name: field_media_use
entity_type: media
bundle: extracted_text
label: 'Media Use'
description: ''
required: false
translatable: true
default_value: { }
default_value_callback: ''
settings:
  handler: 'default:taxonomy_term'
  handler_settings:
    target_bundles:
      islandora_media_use: islandora_media_use
    # Candidate terms are ordered by name, ascending.
    sort:
      field: name
      direction: asc
    auto_create: false
    auto_create_bundle: ''
field_type: entity_reference
@ -0,0 +1,20 @@ |
|||||||
|
# Optional 'MIME type' string field on the 'extracted_text' media bundle,
# defaulting to text/plain.
langcode: en
status: true
dependencies:
  config:
    - field.storage.media.field_mime_type
    - media.type.extracted_text
id: media.extracted_text.field_mime_type
field_name: field_mime_type
entity_type: media
bundle: extracted_text
label: 'MIME type'
description: ''
required: false
translatable: true
default_value:
  -
    value: text/plain
default_value_callback: ''
settings: { }
field_type: string
@ -0,0 +1,22 @@ |
|||||||
|
# Storage definition for the single-value text_long field
# 'field_edited_text' on media entities.
langcode: en
status: true
dependencies:
  module:
    - field_permissions
    - media
    - text
third_party_settings:
  field_permissions:
    permission_type: public
id: media.field_edited_text
field_name: field_edited_text
entity_type: media
type: text_long
settings: { }
module: text
locked: false
cardinality: 1
translatable: true
indexes: { }
persist_with_no_fields: false
custom_storage: false
@ -0,0 +1,10 @@ |
|||||||
|
# Language settings for the 'extracted_text' media bundle: new entities use
# 'site_default' as their language and language_alterable is off.
langcode: en
status: true
dependencies:
  config:
    - media.type.extracted_text
id: media.extracted_text
target_entity_type_id: media
target_bundle: extracted_text
default_langcode: site_default
language_alterable: false
@ -0,0 +1,12 @@ |
|||||||
|
# Media type 'Extracted Text', backed by the 'file' source plugin reading
# from field_media_file.
langcode: en
status: true
dependencies: { }
id: extracted_text
label: 'Extracted Text'
description: 'Text extracted from Images or PDFs'
source: file
queue_thumbnail_downloads: false
new_revision: false
source_configuration:
  source_field: field_media_file
field_map: { }
@ -0,0 +1,19 @@ |
|||||||
|
# Node action 'Extract Text from Image or PDF', implemented by the
# 'generate_ocr_derivative' plugin from islandora_text_extraction.
langcode: en
status: true
dependencies:
  module:
    - islandora_text_extraction
id: get_ocr_from_image
label: 'Extract Text from Image or PDF'
type: node
plugin: generate_ocr_derivative
configuration:
  queue: islandora-connector-ocr
  event: 'Generate Derivative'
  # Media-use term URIs identifying the source and the derivative.
  source_term_uri: 'http://pcdm.org/use#OriginalFile'
  derivative_term_uri: 'http://pcdm.org/use#ExtractedText'
  mimetype: text/plain
  args: null
  destination_media_type: extracted_text
  scheme: public
  # Token pattern for the derivative's path.
  path: '[date:custom:Y]-[date:custom:m]/[node:nid]-[term:name].txt'
@ -0,0 +1,2 @@ |
|||||||
|
# NOTE(review): looks like a Features module *.features.yml declaring this
# package as a required part of the 'islandora' bundle - confirm.
bundle: islandora
required: true
@ -0,0 +1,17 @@ |
|||||||
|
# Module info file for the default configuration that accompanies
# islandora_text_extraction.
name: 'Islandora Text Extraction Defaults'
type: module
description: 'Default config for the Islandora Text Extraction module.'
core: 8.x
package: Islandora
# Modules that must be enabled before this one.
dependencies:
  - field
  - field_permissions
  - file
  - islandora_core_feature
  - islandora_text_extraction
  - language
  - media
  - path
  - system
  - text
  - user
Loading…
Reference in new issue