1 changed files with 112 additions and 78 deletions
@ -1,78 +1,112 @@ |
|||||||
# cli commands |
# CLI Cheatsheet |
||||||
#cli |
|
||||||
|
## Contents |
||||||
## display list of content types and # of associated nodes: |
- [Drupal / Drush](#drupal--drush) |
||||||
drush sqlq 'select count(node.nid) as node_count, node_type.type from node inner join node_type on node.type = node_type.type group by node_type.type' |
- [Text Processing](#text-processing) |
||||||
|
- [Torrents](#torrents) |
||||||
## And then, if you want to filter by a specific type, just use grep like this: |
- [PDF Tools](#pdf-tools) |
||||||
drush sqlq 'select count(node.nid) as node_count, node_type.type from node inner join node_type on node.type = node_type.type group by node_type.type' | grep 2014 |
|
||||||
|
--- |
||||||
## search and replace text in multiple files |
|
||||||
perl -pi -w -e 's/SEARCH_FOR/REPLACE_WITH/g;' *.txt |
## Drupal / Drush |
||||||
perl -pi -w -e 's/thex/robertsonlibrary/g;' **/*.* |
|
||||||
|
> Note: These SQL queries target the Drupal 7 schema (`node_type` table). They won't work as-is on Drupal 8+. |
||||||
## search replace in file names |
|
||||||
rename 's/livero/lives/g' **/*.* -v |
### List content types with node counts |
||||||
|
|
||||||
## torrent download |
drush sqlq 'select count(node.nid) as node_count, node_type.type |
||||||
aria2c -d ~/Downloads "magnetlink" |
from node |
||||||
|
inner join node_type on node.type = node_type.type |
||||||
|
group by node_type.type' |
||||||
ocrmypdf --optimize 3 --skip-text input.pdf output.pdf |
|
||||||
|
### Filter results by keyword (e.g. "2014") |
||||||
|
|
||||||
ocrmypdf --optimize 3 --image-dpi 300 --output-type pdf \ |
drush sqlq 'select count(node.nid) as node_count, node_type.type |
||||||
--force-ocr --tesseract-pagesegmode 1 input.pdf output.pdf |
from node |
||||||
|
inner join node_type on node.type = node_type.type |
||||||
# downsample PDFs to 72dpi |
group by node_type.type' | grep 2014 |
||||||
(single file) |
|
||||||
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 \ |
--- |
||||||
-dPDFSETTINGS=/screen \ |
|
||||||
-dNOPAUSE -dQUIET -dBATCH \ |
## Text Processing |
||||||
-sOutputFile=output.pdf input.pdf |
|
||||||
|
### Find and replace inside files (perl) |
||||||
(batch) |
|
||||||
|
# Single file type in current directory |
||||||
# In the folder with your PDFs |
perl -pi -w -e 's/SEARCH_FOR/REPLACE_WITH/g;' *.txt |
||||||
mkdir downsampled |
|
||||||
|
# Recursively across all file types |
||||||
for f in *.pdf *.PDF; do |
perl -pi -w -e 's/thex/robertsonlibrary/g;' **/*.* |
||||||
[ -f "$f" ] || continue |
|
||||||
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \ |
### Find and replace in file names |
||||||
-dNOPAUSE -dBATCH \ |
|
||||||
-sOutputFile="downsampled/${f%.pdf}_72dpi.pdf" \ |
# Rename files matching a pattern, verbose output shows what changed |
||||||
"$f" |
rename 's/livero/lives/g' **/*.* -v |
||||||
done |
|
||||||
# In the folder that contains your original PDFs |
--- |
||||||
|
|
||||||
mkdir -p downsampled |
## Torrents |
||||||
|
|
||||||
for f in *.pdf *.PDF; do |
### Download via magnet link (aria2c) |
||||||
[ -f "$f" ] || continue |
|
||||||
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \ |
# aria2c is a lightweight multi-protocol download utility |
||||||
-dNOPAUSE -dBATCH -dQUIET \ |
aria2c -d ~/Downloads "magnetlink" |
||||||
-sOutputFile="downsampled/$f" \ |
|
||||||
"$f" |
--- |
||||||
done |
|
||||||
|
## PDF Tools |
||||||
# check current dpi |
|
||||||
for f in *.pdf *.PDF; do |
### OCR a PDF (ocrmypdf) |
||||||
echo "=== Images in: $f ===" |
|
||||||
pdfimages -list "$f" |
# Standard: optimize output, skip pages that already have a text layer |
||||||
echo "" |
ocrmypdf --optimize 3 --skip-text input.pdf output.pdf |
||||||
done |
|
||||||
# Creates (or overwrites) images_list.txt in the current directory |
# Aggressive: force re-OCR even if a text layer exists (useful for corrupt/bad layers), |
||||||
for f in *.pdf *.PDF; do |
# set DPI manually, use page segmentation mode 1 (automatic with OSD) |
||||||
if [ -f "$f" ]; then |
ocrmypdf --optimize 3 --image-dpi 300 --output-type pdf \ |
||||||
echo "=== Images in: $f ===" >> images_list.txt |
--force-ocr --tesseract-pagesegmode 1 input.pdf output.pdf |
||||||
pdfimages -list "$f" >> images_list.txt |
|
||||||
echo "" >> images_list.txt |
### Downsample a PDF to 72dpi (Ghostscript) |
||||||
fi |
|
||||||
done |
# Single file |
||||||
# scan for ccitt encoding |
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 \ |
||||||
for f in *.pdf *.PDF; do |
-dPDFSETTINGS=/screen \ |
||||||
[ -f "$f" ] || continue |
-dNOPAUSE -dQUIET -dBATCH \ |
||||||
if pdfimages -list "$f" 2>/dev/null | grep -q " ccitt "; then |
-sOutputFile=output.pdf input.pdf |
||||||
echo "$f uses CCITT" |
|
||||||
fi |
# Batch - processes all PDFs in current folder, preserves original filenames |
||||||
done |
mkdir -p downsampled |
||||||
|
for f in *.pdf *.PDF; do |
||||||
|
[ -f "$f" ] || continue |
||||||
|
gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 -dPDFSETTINGS=/screen \ |
||||||
|
-dNOPAUSE -dBATCH -dQUIET \ |
||||||
|
-sOutputFile="downsampled/$f" \ |
||||||
|
"$f" |
||||||
|
done |
||||||
|
|
||||||
|
### Check image DPI in PDFs (pdfimages) |
||||||
|
|
||||||
|
# Print image info to terminal |
||||||
|
for f in *.pdf *.PDF; do |
||||||
|
echo "=== $f ===" |
||||||
|
pdfimages -list "$f" |
||||||
|
echo "" |
||||||
|
done |
||||||
|
|
||||||
|
# Save output to images_list.txt instead |
||||||
|
for f in *.pdf *.PDF; do |
||||||
|
[ -f "$f" ] || continue |
||||||
|
echo "=== $f ===" >> images_list.txt |
||||||
|
pdfimages -list "$f" >> images_list.txt |
||||||
|
echo "" >> images_list.txt |
||||||
|
done |
||||||
|
|
||||||
|
### Scan for CCITT encoding |
||||||
|
|
||||||
|
# CCITT is a fax-era compression format; this loop flags PDFs that use it, since they may cause compatibility issues |
||||||
|
for f in *.pdf *.PDF; do |
||||||
|
[ -f "$f" ] || continue |
||||||
|
if pdfimages -list "$f" 2>/dev/null | grep -q " ccitt "; then |
||||||
|
echo "$f uses CCITT" |
||||||
|
fi |
||||||
|
done |
||||||
|
|||||||
Loading…
Reference in new issue