From dfdfe1adc6b07392ef4ae9357dfec5fe77d0288a Mon Sep 17 00:00:00 2001
From: Alexander O'Neill <alexander@born-digital.com>
Date: Wed, 3 May 2023 11:23:34 -0300
Subject: [PATCH] Don't add <br /> tags to edited OCR text field if it looks
 like hOCR.

---
 .../islandora_text_extraction.module                          | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/islandora_text_extraction/islandora_text_extraction.module b/modules/islandora_text_extraction/islandora_text_extraction.module
index 5d6f6437..9bff85f0 100644
--- a/modules/islandora_text_extraction/islandora_text_extraction.module
+++ b/modules/islandora_text_extraction/islandora_text_extraction.module
@@ -40,6 +40,10 @@ function islandora_text_extraction_media_presave(MediaInterface $media) {
       $file = File::load($file_id);
       if ($file) {
         $data = file_get_contents($file->getFileUri());
+        // Leave XML markup such as hOCR alone: it opens with "<?xml".
+        if (substr($data, 0, 5) === '<?xml') {
+          return;
+        }
         $data = nl2br($data);
         $media->set('field_edited_text', $data);
         $media->field_edited_text->format = 'basic_html';