[nb] Edit: cli_commands.md

4 months ago · b057b0b94e
1 changed file with 112 additions and 78 deletions
--- a/cli_commands.md
+++ b/cli_commands.md
@@ -1,78 +1,112 @@
-# cli commands
-#cli
+# CLI Cheatsheet

-## display list of content types and # of associated nodes:
-drush sqlq 'select count(node.nid) as node_count, node_type.type from node inner join node_type on node.type = node_type.type group by node_type.type'
+## Contents
+- [Drupal / Drush](#drupal--drush)
+- [Text Processing](#text-processing)
+- [Torrents](#torrents)
+- [PDF Tools](#pdf-tools)

-## And then if you want filter by a specific type, just use grep like this:
-drush sqlq 'select count(node.nid) as node_count, node_type.type from node inner join node_type on node.type = node_type.type group by node_type.type' | grep 2014
+---

-## search replace in text mulitle files
-perl -pi -w -e 's/SEARCH_FOR/REPLACE_WITH/g;' *.txt
-perl -pi -w -e 's/thex/robertsonlibrary/g;' **/*.*
+## Drupal / Drush

-## search replace in file names
-rename 's/livero/lives/g' **/*.* -v
+> Note: These SQL queries target the Drupal 7 schema (`node_type` table). They won't work as-is on Drupal 8+.

-## torrent download
-aria2c -d ~/Downloads "magnetlink" 
+### List content types with node counts

+    drush sqlq 'select count(node.nid) as node_count, node_type.type
+                from node
+                inner join node_type on node.type = node_type.type
+                group by node_type.type'

-ocrmypdf --optimize 3 --skip-text input.pdf output.pdf
+### Filter results by keyword (e.g. "2014")

+    drush sqlq 'select count(node.nid) as node_count, node_type.type
+                from node
+                inner join node_type on node.type = node_type.type
+                group by node_type.type' | grep 2014

-ocrmypdf --optimize 3 --image-dpi 300 --output-type pdf \
-         --force-ocr --tesseract-pagesegmode 1 input.pdf output.pdf
+---

-# down sample pdfs to 72dpi
-(single file)
-gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 \
-   -dPDFSETTINGS=/screen \
-   -dNOPAUSE -dQUIET -dBATCH \
-   -sOutputFile=output.pdf input.pdf
+## Text Processing

-(batch)
+### Find and replace inside files (perl)

-# In the folder with your PDFs
-mkdir downsampled
+    # Single file type in current directory
+    perl -pi -w -e 's/SEARCH_FOR/REPLACE_WITH/g;' *.txt

-for f in *.pdf *.PDF; do
-    [ -f "$f" ] || continue
-    gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \
-       -dNOPAUSE -dBATCH \
-       -sOutputFile="downsampled/${f%.pdf}_72dpi.pdf" \
-       "$f"
-done
-# In the folder that contains your original PDFs
+    # Recursively across all file types (needs zsh, or bash with `shopt -s globstar`; otherwise ** acts like *)
+    perl -pi -w -e 's/thex/robertsonlibrary/g;' **/*.*
+
+### Find and replace in file names
+
+    # Rename files matching a pattern, verbose output shows what changed
+    rename 's/livero/lives/g' **/*.* -v
+
+---
+
+## Torrents
+
+### Download via magnet link (aria2c)
+
+    # aria2c is a lightweight multi-protocol download utility
+    aria2c -d ~/Downloads "magnetlink"
+
+---
+
+## PDF Tools

-mkdir -p downsampled
+### OCR a PDF (ocrmypdf)

-for f in *.pdf *.PDF; do
+    # Standard: optimize output, skip pages that already have a text layer
+    ocrmypdf --optimize 3 --skip-text input.pdf output.pdf
+
+    # Aggressive: force re-OCR even if a text layer exists (useful for corrupt/bad layers),
+    # set DPI manually, use page segmentation mode 1 (automatic with OSD)
+    ocrmypdf --optimize 3 --image-dpi 300 --output-type pdf \
+             --force-ocr --tesseract-pagesegmode 1 input.pdf output.pdf
+
+### Downsample a PDF to 72dpi (Ghostscript)
+
+    # Single file
+    gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 \
+       -dPDFSETTINGS=/screen \
+       -dNOPAUSE -dQUIET -dBATCH \
+       -sOutputFile=output.pdf input.pdf
+
+    # Batch - processes all PDFs in current folder, preserves original filenames
+    mkdir -p downsampled
+    for f in *.pdf *.PDF; do
        [ -f "$f" ] || continue
        gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \
           -dNOPAUSE -dBATCH -dQUIET \
           -sOutputFile="downsampled/$f" \
           "$f"
-done
+    done
+
+### Check image DPI in PDFs (pdfimages)

-# check current dpi
-for f in *.pdf *.PDF; do
-    echo "=== Images in: $f ==="
+    # Print image info to terminal
+    for f in *.pdf *.PDF; do
+        echo "=== $f ==="
        pdfimages -list "$f"
        echo ""
-done
-# Creates (or overwrites) images_list.txt in the current directory
-for f in *.pdf *.PDF; do
-    if [ -f "$f" ]; then
-        echo "=== Images in: $f ===" >> images_list.txt
+    done
+
+    # Append output to images_list.txt instead (>> keeps any existing content; delete the file first for a fresh list)
+    for f in *.pdf *.PDF; do
+        [ -f "$f" ] || continue
+        echo "=== $f ===" >> images_list.txt
        pdfimages -list "$f" >> images_list.txt
        echo "" >> images_list.txt
-    fi
-done
-# scan for ccitt encoding
-for f in *.pdf *.PDF; do
+    done
+
+### Scan for CCITT encoding
+
+    # CCITT is a fax-era compression format; this loop lists PDFs that use it, since they may cause compatibility issues
+    for f in *.pdf *.PDF; do
        [ -f "$f" ] || continue
        if pdfimages -list "$f" 2>/dev/null | grep -q " ccitt "; then
            echo "$f uses CCITT"
        fi
-done
+    done