Browse Source

[nb] Edit: cli_commands.md

master
rdrew 2 weeks ago
parent
commit
b057b0b94e
  1. 190
      cli_commands.md

190
cli_commands.md

@ -1,78 +1,112 @@
# cli commands # CLI Cheatsheet
#cli
## Contents
## display list of content types and # of associated nodes: - [Drupal / Drush](#drupal--drush)
drush sqlq 'select count(node.nid) as node_count, node_type.type from node inner join node_type on node.type = node_type.type group by node_type.type' - [Text Processing](#text-processing)
- [Torrents](#torrents)
## And then if you want filter by a specific type, just use grep like this: - [PDF Tools](#pdf-tools)
drush sqlq 'select count(node.nid) as node_count, node_type.type from node inner join node_type on node.type = node_type.type group by node_type.type' | grep 2014
---
## search and replace text in multiple files
perl -pi -w -e 's/SEARCH_FOR/REPLACE_WITH/g;' *.txt ## Drupal / Drush
perl -pi -w -e 's/thex/robertsonlibrary/g;' **/*.*
> Note: These SQL queries target the Drupal 7 schema (`node_type` table). They won't work as-is on Drupal 8+.
## search replace in file names
rename 's/livero/lives/g' **/*.* -v ### List content types with node counts
## torrent download drush sqlq 'select count(node.nid) as node_count, node_type.type
aria2c -d ~/Downloads "magnetlink" from node
inner join node_type on node.type = node_type.type
group by node_type.type'
ocrmypdf --optimize 3 --skip-text input.pdf output.pdf
### Filter results by keyword (e.g. "2014")
ocrmypdf --optimize 3 --image-dpi 300 --output-type pdf \ drush sqlq 'select count(node.nid) as node_count, node_type.type
--force-ocr --tesseract-pagesegmode 1 input.pdf output.pdf from node
inner join node_type on node.type = node_type.type
# down sample pdfs to 72dpi group by node_type.type' | grep 2014
(single file)
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 \ ---
-dPDFSETTINGS=/screen \
-dNOPAUSE -dQUIET -dBATCH \ ## Text Processing
-sOutputFile=output.pdf input.pdf
### Find and replace inside files (perl)
(batch)
# Single file type in current directory
# In the folder with your PDFs perl -pi -w -e 's/SEARCH_FOR/REPLACE_WITH/g;' *.txt
mkdir downsampled
# Recursively across all file types
for f in *.pdf *.PDF; do perl -pi -w -e 's/thex/robertsonlibrary/g;' **/*.*
[ -f "$f" ] || continue
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \ ### Find and replace in file names
-dNOPAUSE -dBATCH \
-sOutputFile="downsampled/${f%.pdf}_72dpi.pdf" \ # Rename files matching a pattern, verbose output shows what changed
"$f" rename 's/livero/lives/g' **/*.* -v
done
# In the folder that contains your original PDFs ---
mkdir -p downsampled ## Torrents
for f in *.pdf *.PDF; do ### Download via magnet link (aria2c)
[ -f "$f" ] || continue
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \ # aria2c is a lightweight multi-protocol download utility
-dNOPAUSE -dBATCH -dQUIET \ aria2c -d ~/Downloads "magnetlink"
-sOutputFile="downsampled/$f" \
"$f" ---
done
## PDF Tools
# check current dpi
for f in *.pdf *.PDF; do ### OCR a PDF (ocrmypdf)
echo "=== Images in: $f ==="
pdfimages -list "$f" # Standard: optimize output, skip pages that already have a text layer
echo "" ocrmypdf --optimize 3 --skip-text input.pdf output.pdf
done
# Creates (or appends to) images_list.txt in the current directory # Aggressive: force re-OCR even if a text layer exists (useful for corrupt/bad layers),
for f in *.pdf *.PDF; do # set DPI manually, use page segmentation mode 1 (automatic with OSD)
if [ -f "$f" ]; then ocrmypdf --optimize 3 --image-dpi 300 --output-type pdf \
echo "=== Images in: $f ===" >> images_list.txt --force-ocr --tesseract-pagesegmode 1 input.pdf output.pdf
pdfimages -list "$f" >> images_list.txt
echo "" >> images_list.txt ### Downsample a PDF to 72dpi (Ghostscript)
fi
done # Single file
# scan for ccitt encoding gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 \
for f in *.pdf *.PDF; do -dPDFSETTINGS=/screen \
[ -f "$f" ] || continue -dNOPAUSE -dQUIET -dBATCH \
if pdfimages -list "$f" 2>/dev/null | grep -q " ccitt "; then -sOutputFile=output.pdf input.pdf
echo "$f uses CCITT"
fi # Batch - processes all PDFs in current folder, preserves original filenames
done mkdir -p downsampled
# Re-distill every PDF in the current folder with the /screen preset and write
# the result to downsampled/ under the same filename.
# downsampled/ is expected to exist before this loop runs.
for f in *.pdf *.PDF; do
  # A glob with no matches stays as the literal pattern; only handle real files.
  if [ -f "$f" ]; then
    gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \
      -dNOPAUSE -dBATCH -dQUIET \
      -sOutputFile="downsampled/$f" \
      "$f"
  fi
done
### Check image DPI in PDFs (pdfimages)
# Print image info to terminal
for f in *.pdf *.PDF; do
  # Without nullglob, a pattern that matches nothing is left as the literal
  # string (e.g. "*.PDF"); skip it so pdfimages is only run on real files.
  [ -f "$f" ] || continue
  echo "=== $f ==="
  pdfimages -list "$f"
  echo ""
done
# Save output to images_list.txt instead (each run appends to the file)
for f in *.pdf *.PDF; do
  # A glob with no matches stays as the literal pattern; only handle real files.
  if [ -f "$f" ]; then
    # Group the header, the listing and the blank separator under one append.
    {
      echo "=== $f ==="
      pdfimages -list "$f"
      echo ""
    } >> images_list.txt
  fi
done
### Scan for CCITT encoding
# CCITT is a fax-era compression format - flags PDFs that may cause compatibility issues
# Prints one line per PDF whose `pdfimages -list` output contains " ccitt ".
for f in *.pdf *.PDF; do
  # Real files only; pdfimages errors are discarded and grep -q keeps the match silent.
  if [ -f "$f" ] && pdfimages -list "$f" 2>/dev/null | grep -q " ccitt "; then
    echo "$f uses CCITT"
  fi
done

Loading…
Cancel
Save