[nb] Edit: cli_commands.md

4 months ago · b057b0b94e
1 changed file with 112 additions and 78 deletions
--- a/cli_commands.md
+++ b/cli_commands.md
@@ -1,78 +1,112 @@
-# cli commands
+# CLI Cheatsheet
-#cli
+
-
+## Contents
-## display list of content types and # of associated nodes:
+- [Drupal / Drush](#drupal--drush)
-drush sqlq 'select count(node.nid) as node_count, node_type.type from node inner join node_type on node.type = node_type.type group by node_type.type'
+- [Text Processing](#text-processing)
-
+- [Torrents](#torrents)
-## And then if you want filter by a specific type, just use grep like this:
+- [PDF Tools](#pdf-tools)
-drush sqlq 'select count(node.nid) as node_count, node_type.type from node inner join node_type on node.type = node_type.type group by node_type.type' | grep 2014
+
-
+---
-## search replace in text mulitle files
+
-perl -pi -w -e 's/SEARCH_FOR/REPLACE_WITH/g;' *.txt
+## Drupal / Drush
-perl -pi -w -e 's/thex/robertsonlibrary/g;' **/*.*
+
-
+> Note: These SQL queries target the Drupal 7 schema (`node_type` table). They won't work as-is on Drupal 8+.
-## search replace in file names
+
-rename 's/livero/lives/g' **/*.* -v
+### List content types with node counts
-
+
-## torrent download
+    drush sqlq 'select count(node.nid) as node_count, node_type.type
-aria2c -d ~/Downloads "magnetlink" 
+                from node
-
+                inner join node_type on node.type = node_type.type
-
+                group by node_type.type'
-ocrmypdf --optimize 3 --skip-text input.pdf output.pdf
+
-
+### Filter results by keyword (e.g. "2014")
-
+
-ocrmypdf --optimize 3 --image-dpi 300 --output-type pdf \
+    drush sqlq 'select count(node.nid) as node_count, node_type.type
-         --force-ocr --tesseract-pagesegmode 1 input.pdf output.pdf
+                from node
-
+                inner join node_type on node.type = node_type.type
-# down sample pdfs to 72dpi
+                group by node_type.type' | grep 2014
-(single file)
+
-gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 \
+---
-   -dPDFSETTINGS=/screen \
+
-   -dNOPAUSE -dQUIET -dBATCH \
+## Text Processing
-   -sOutputFile=output.pdf input.pdf
+
-
+### Find and replace inside files (perl)
-(batch)
+
-
+    # Single file type in current directory
-# In the folder with your PDFs
+    perl -pi -w -e 's/SEARCH_FOR/REPLACE_WITH/g;' *.txt
-mkdir downsampled
+
-
+    # Recursively across all files whose names contain a dot (in bash, run `shopt -s globstar` first so that ** recurses)
-for f in *.pdf *.PDF; do
+    perl -pi -w -e 's/thex/robertsonlibrary/g;' **/*.*
-    [ -f "$f" ] || continue
+
-    gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \
+### Find and replace in file names
-       -dNOPAUSE -dBATCH \
+
-       -sOutputFile="downsampled/${f%.pdf}_72dpi.pdf" \
+    # Rename files matching a pattern (Perl rename syntax); -v prints each file that was renamed
-       "$f"
+    rename 's/livero/lives/g' **/*.* -v
-done
+
-# In the folder that contains your original PDFs
+---
-
+
-mkdir -p downsampled
+## Torrents
-
+
-for f in *.pdf *.PDF; do
+### Download via magnet link (aria2c)
-    [ -f "$f" ] || continue
+
-    gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \
+    # aria2c is a lightweight multi-protocol download utility
-       -dNOPAUSE -dBATCH -dQUIET \
+    aria2c -d ~/Downloads "magnetlink"
-       -sOutputFile="downsampled/$f" \
+
-       "$f"
+---
-done
+
-
+## PDF Tools
-# check current dpi
+
-for f in *.pdf *.PDF; do
+### OCR a PDF (ocrmypdf)
-    echo "=== Images in: $f ==="
+
-    pdfimages -list "$f"
+    # Standard: optimize output, skip pages that already have a text layer
-    echo ""
+    ocrmypdf --optimize 3 --skip-text input.pdf output.pdf
-done
+
-# Creates (or overwrites) images_list.txt in the current directory
+    # Aggressive: force re-OCR even if a text layer exists (useful for corrupt/bad layers),
-for f in *.pdf *.PDF; do
+    # set DPI manually, use page segmentation mode 1 (automatic with OSD)
-    if [ -f "$f" ]; then
+    ocrmypdf --optimize 3 --image-dpi 300 --output-type pdf \
-        echo "=== Images in: $f ===" >> images_list.txt
+             --force-ocr --tesseract-pagesegmode 1 input.pdf output.pdf
-        pdfimages -list "$f"       >> images_list.txt
+
-        echo ""                    >> images_list.txt
+### Downsample a PDF to 72dpi (Ghostscript)
-    fi
+
-done
+    # Single file
-# scan for ccitt encoding
+    gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 \
-for f in *.pdf *.PDF; do
+       -dPDFSETTINGS=/screen \
-    [ -f "$f" ] || continue
+       -dNOPAUSE -dQUIET -dBATCH \
-    if pdfimages -list "$f" 2>/dev/null | grep -q " ccitt "; then
+       -sOutputFile=output.pdf input.pdf
-        echo "$f uses CCITT"
+
-    fi
+    # Batch - processes all PDFs in current folder, preserves original filenames
-done
+    mkdir -p downsampled
    for f in *.pdf *.PDF; do
        [ -f "$f" ] || continue
        gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \
           -dNOPAUSE -dBATCH -dQUIET \
           -sOutputFile="downsampled/$f" \
           "$f"
    done
 ### Check image DPI in PDFs (pdfimages)
    # Print image info to terminal
    for f in *.pdf *.PDF; do
        echo "=== $f ==="
        pdfimages -list "$f"
        echo ""
    done
    # Append output to images_list.txt instead (>> adds to an existing file; delete it first for a fresh list)
    for f in *.pdf *.PDF; do
        [ -f "$f" ] || continue
        echo "=== $f ===" >> images_list.txt
        pdfimages -list "$f" >> images_list.txt
        echo "" >> images_list.txt
    done
 ### Scan for CCITT encoding
    # CCITT is a fax-era compression format; this loop reports PDFs whose images use it, since they may cause compatibility issues
    for f in *.pdf *.PDF; do
        [ -f "$f" ] || continue
        if pdfimages -list "$f" 2>/dev/null | grep -q " ccitt "; then
            echo "$f uses CCITT"
        fi
    done