classify-recovered-archives/salvage-damaged-zips.zsh
Reindl David (IT-PTR-CEN2-SL10) 94bfc77c11 initial state
Co-authored-by: Copilot <copilot@github.com>
2026-05-02 17:06:52 +02:00

299 lines
7.2 KiB
Bash
Executable File

#!/bin/zsh
# Shell setup and global state for the damaged-ZIP salvage script.

# Start from native zsh option defaults before applying the options below.
emulate -L zsh
# extended_glob: enables the `##` ("one or more") patterns used by the
#                trimming helpers further down.
# no_nomatch:    an unmatched glob is left as-is instead of raising an error.
# no_unset:      expanding an unset parameter is an error.
# pipefail:      a pipeline fails when any of its stages fails.
setopt extended_glob no_nomatch no_unset pipefail
# Base name of the invoked script (read-only).
typeset -gr SCRIPT_NAME=${0:t}
# Directory to scan; assigned by parse_args from the positional argument.
typeset -g INPUT_ROOT=""
# Destination directory; assigned by parse_args (-o/--output or a default).
typeset -g OUTPUT_ROOT=""
# Set to 1 by -n/--dry-run.
typeset -g DRY_RUN=0
# Set to 1 by -v/--verbose; consulted by verbose().
typeset -g VERBOSE=0
# Print the command-line help text on standard output.
usage() {
  printf '%s\n' \
    'Usage: salvage-damaged-zips.zsh [options] DIRECTORY' \
    'Attempt repair and partial extraction for damaged ZIP files under DIRECTORY.' \
    'Options:' \
    '-n, --dry-run Print planned actions without writing repaired files.' \
    '-v, --verbose Print extra diagnostics while processing.' \
    '-o, --output DIR Write results into DIR. Defaults to DIRECTORY.salvaged.' \
    '-h, --help Show this help text.'
}
# Write all arguments, joined into one line, to standard output.
# No escape sequences are interpreted.
log() {
  printf '%s\n' "$*"
}
# Like log, but only emits output when the global VERBOSE flag is non-zero.
verbose() {
  (( VERBOSE )) || return 0
  printf '%s\n' "$*"
}
# Report a fatal error on standard error (prefixed with "error: ") and
# terminate the script with exit status 1.
die() {
  printf '%s\n' "error: $*" >&2
  exit 1
}
# Verify that every external program in the required list can be resolved
# via `command -v`; abort through die on the first one that cannot.
ensure_tools() {
  local -a required
  required=(zip unzip bsdtar file strings perl mktemp find)
  local candidate
  for candidate in "${required[@]}"; do
    if ! command -v "$candidate" >/dev/null 2>&1; then
      die "required tool not found: $candidate"
    fi
  done
}
# Normalize a single value for display: carriage returns, newlines and tabs
# each become one space, then leading and trailing whitespace is removed.
# The result is printed on standard output.
trim_value() {
  local text="$1"
  local ch
  for ch in $'\r' $'\n' $'\t'; do
    text=${text//$ch/ }
  done
  # `##` is the extended_glob "one or more" operator.
  text=${text##[[:space:]]##}
  text=${text%%[[:space:]]##}
  printf '%s\n' "$text"
}
# Turn an arbitrary label into a file-system friendly name and print it.
# Steps: trim via trim_value, drop NUL bytes, replace '/', ':' and '\' with
# '-', squeeze runs of spaces, strip leading dots and surrounding whitespace.
# Falls back to "Untitled" when nothing is left.
sanitize_name() {
  local name
  name=$(trim_value "$1")
  name=${name//$'\0'/}
  name=${name//\\/-}
  name=${name//:/-}
  name=${name//\//-}
  name=$(tr -s ' ' <<<"$name")
  # Leading dots go first so whitespace they exposed is trimmed afterwards.
  name=${name##.##}
  name=${name##[[:space:]]##}
  name=${name%%[[:space:]]##}
  [[ -n "$name" ]] || name="Untitled"
  printf '%s\n' "$name"
}
# Parse the command line into the global settings.
# Globals written: DRY_RUN, VERBOSE, INPUT_ROOT, OUTPUT_ROOT.
# Arguments: the script's command-line arguments.
# Exits: 0 after printing help for -h/--help; 1 (via die or after printing
#        usage on stderr) for unknown options, a missing --output value,
#        more than one directory, or a missing/non-existent directory.
parse_args() {
  local arg
  local -i options_done=0
  while (( $# )); do
    arg=$1
    shift
    if (( ! options_done )); then
      case "$arg" in
        -n|--dry-run)
          DRY_RUN=1
          continue
          ;;
        -v|--verbose)
          VERBOSE=1
          continue
          ;;
        -o|--output)
          (( $# )) || die "missing argument for --output"
          OUTPUT_ROOT=$1
          shift
          continue
          ;;
        -h|--help)
          usage
          exit 0
          ;;
        --)
          # Everything after `--` is an operand, even if it starts with '-'.
          # Previously the loop stopped here and dropped the operands.
          options_done=1
          continue
          ;;
        -*)
          die "unknown option: $arg"
          ;;
      esac
    fi
    if [[ -n "$INPUT_ROOT" ]]; then
      die "only one directory may be provided"
    fi
    INPUT_ROOT=$arg
  done
  if [[ -z "$INPUT_ROOT" ]]; then
    # This is an error path, so the help text goes to stderr.
    usage >&2
    exit 1
  fi
  [[ -d "$INPUT_ROOT" ]] || die "directory does not exist: $INPUT_ROOT"
  # :A resolves to an absolute path with symlinks followed.
  INPUT_ROOT=${INPUT_ROOT:A}
  if [[ -z "$OUTPUT_ROOT" ]]; then
    OUTPUT_ROOT="${INPUT_ROOT}.salvaged"
  fi
  OUTPUT_ROOT=${OUTPUT_ROOT:A}
}
# List every regular file under INPUT_ROOT whose name ends in .zip
# (case-insensitive), one path per line, sorted.
collect_archives() {
  find "$INPUT_ROOT" \
    -type f \
    -iname '*.zip' \
    -print \
    | sort
}
# Print the printable strings found anywhere in the given file
# (`strings -a`), discarding any diagnostics from the tool.
archive_markers() {
  local archive_path="$1"
  strings -a "$archive_path" 2>/dev/null
}
# Test whether a block of text matches an extended regular expression.
# Arguments: $1 - text to search, $2 - ERE pattern.
# Returns: 0 when at least one line matches, non-zero otherwise.
text_has_marker() {
  local text="$1"
  local pattern="$2"
  # Feed the text through a here-string rather than `print | grep -q`.
  # With pipefail enabled, grep -q exiting on an early match can kill the
  # writer with SIGPIPE and turn a successful match into a failure.
  grep -E -q -- "$pattern" <<<"$text"
}
# Search a file's raw bytes for an extended regular expression.
# Arguments: $1 - file to scan, $2 - ERE pattern.
# Returns: grep's status (0 when the pattern occurs in the file).
archive_has_binary_marker() {
  local archive_path="$1" marker_regex="$2"
  # LC_ALL=C keeps matching byte-wise; -a treats binary content as text.
  LC_ALL=C grep -a -E -q -e "$marker_regex" -- "$archive_path"
}
# Guess which document family a damaged archive belongs to from path-like
# markers embedded in its bytes, and print the family label.
# Checks run in priority order; the first match wins:
#   Index/Tables/                                   -> Damaged-Numbers
#   Metadata/DocumentProperties.plist or Pages/     -> Damaged-Pages
#   Index/Document.iwa and Index/CalculationEngine  -> Damaged-Apple-iWork
#   otherwise                                       -> Damaged-Zip
classify_marker_family() {
  local archive="$1"
  local family="Damaged-Zip"
  if archive_has_binary_marker "$archive" 'Index/Tables/'; then
    family="Damaged-Numbers"
  elif archive_has_binary_marker "$archive" 'Metadata/DocumentProperties\.plist|Pages/'; then
    family="Damaged-Pages"
  elif archive_has_binary_marker "$archive" 'Index/Document\.iwa' \
    && archive_has_binary_marker "$archive" 'Index/CalculationEngine'; then
    family="Damaged-Apple-iWork"
  fi
  printf '%s\n' "$family"
}
# Print the argument with every '|' backslash-escaped so it can sit inside
# a Markdown table cell without ending the cell.
escape_md_cell() {
  local escaped
  escaped=${1//|/\\|}
  printf '%s\n' "$escaped"
}
# Run `zip -FF` on a damaged archive, writing the rebuilt archive to a new
# file. A single "y" line is supplied on stdin to answer zip's prompt.
# Arguments: $1 - damaged archive, $2 - path for the rebuilt archive.
# Side effects: zip's stderr is saved to "<rebuilt>.repair.log"; its stdout
#               is discarded.
# Returns: zip's exit status.
repair_archive() {
  local damaged="$1" rebuilt="$2"
  local log_path="${rebuilt}.repair.log"
  zip -FF "$damaged" --out "$rebuilt" <<<'y' >/dev/null 2>"$log_path"
}
# Unpack a rebuilt archive into a directory with bsdtar.
# Arguments: $1 - archive to extract, $2 - destination directory (created
#            if missing).
# Side effects: bsdtar's stderr is saved to "<destination>.extract.log".
# Returns: 1 when the destination cannot be created, else bsdtar's status.
extract_repaired_archive() {
  local zip_path="$1" target_dir="$2"
  if ! mkdir -p -- "$target_dir"; then
    return 1
  fi
  bsdtar -xf "$zip_path" -C "$target_dir" 2>"${target_dir}.extract.log"
}
# Create (or truncate) both report files and write their headers.
# Arguments: $1 - Markdown report path, $2 - TSV report path.
# Globals read: INPUT_ROOT, OUTPUT_ROOT (shown in the Markdown preamble).
write_report_header() {
  local md_path="$1" tsv_path="$2"
  printf '%s\n' \
    '# Damaged ZIP Salvage Report' \
    "Source: $INPUT_ROOT" \
    "Output: $OUTPUT_ROOT" \
    '| Archive | Likely family | Repaired entries | Extracted visible assets | Notes |' \
    '| --- | --- | ---: | ---: | --- |' > "$md_path"
  # Column names separated by real tab characters.
  printf '%s\n' $'archive\tfamily\trepaired_entries\tvisible_assets\tnotes' > "$tsv_path"
}
# Append one result row to both the Markdown and the TSV report.
# Arguments:
#   $1 - Markdown report path      $2 - TSV report path
#   $3 - archive label             $4 - family label
#   $5 - repaired entry count      $6 - visible asset count
#   $7 - free-text notes
append_report_row() {
  local markdown_report="$1"
  local tsv_report="$2"
  local archive_label="$3"
  local family="$4"
  local repaired_entries="$5"
  local visible_assets="$6"
  local notes="$7"
  local field
  local -a tsv_fields
  tsv_fields=()

  printf '| %s | %s | %s | %s | %s |\n' \
    "$(escape_md_cell "$archive_label")" \
    "$(escape_md_cell "$family")" \
    "$repaired_entries" \
    "$visible_assets" \
    "$(escape_md_cell "$notes")" >> "$markdown_report"

  # A tab or newline inside a field would corrupt the row, so flatten them.
  for field in "$archive_label" "$family" "$repaired_entries" "$visible_assets" "$notes"; do
    field=${field//$'\t'/ }
    field=${field//$'\n'/ }
    tsv_fields+=("$field")
  done
  # The printf format supplies real tab separators. The earlier
  # `print -r -- "...\t..."` wrote a literal backslash-t instead.
  printf '%s\t%s\t%s\t%s\t%s\n' "${tsv_fields[@]}" >> "$tsv_report"
}
# Script driver.
# Parses arguments, checks required tools and collects the .zip files under
# INPUT_ROOT. In dry-run mode it only prints the family guessed for each
# archive. Otherwise, for every archive it:
#   1. rebuilds it with repair_archive into OUTPUT_ROOT/repaired,
#   2. counts the entries listed by `unzip -Z1`,
#   3. extracts it (best effort) into OUTPUT_ROOT/extracted/<family>/<name>,
#   4. counts recovered image/PDF files,
#   5. appends a row to the Markdown and TSV reports.
# Arguments: the script's command-line arguments.
main() {
  local -a archives
  # Output names already handed out in this run, keyed in lower case so the
  # check also holds on case-insensitive file systems.
  local -A used_names
  local archive=""
  local source_name=""
  local requested_name=""
  local base_name=""
  local name_key=""
  local -i collision_index=0
  local rename_note=""
  local family=""
  local family_dir=""
  local repaired_archive=""
  local extract_dir=""
  local repaired_listing=""
  local repaired_entries=0
  local visible_assets=0
  local notes=""
  local markdown_report=""
  local tsv_report=""

  parse_args "$@"
  ensure_tools

  # (f) splits the command output on newlines, one array element per path.
  archives=(${(f)"$(collect_archives)"})
  if (( ${#archives} == 0 )); then
    log "No .zip files found under $INPUT_ROOT"
    return 0
  fi

  if (( DRY_RUN )); then
    for archive in "${archives[@]}"; do
      family=$(classify_marker_family "$archive")
      log "DRY-RUN $archive => $family"
    done
    return 0
  fi

  mkdir -p -- "$OUTPUT_ROOT/repaired" "$OUTPUT_ROOT/extracted" "$OUTPUT_ROOT/logs" || die "failed to create output directories"
  markdown_report="$OUTPUT_ROOT/salvage-report.md"
  tsv_report="$OUTPUT_ROOT/salvage-report.tsv"
  write_report_header "$markdown_report" "$tsv_report"

  for archive in "${archives[@]}"; do
    source_name=${archive:t}
    requested_name=$(sanitize_name "${source_name:r}")

    # Archives in different subdirectories can share a base name. Without a
    # unique output name the later one would overwrite the earlier repaired
    # archive and log, and both would extract into the same directory.
    base_name=$requested_name
    name_key=${(L)base_name}
    collision_index=1
    while (( ${+used_names[$name_key]} )); do
      (( collision_index++ ))
      base_name="${requested_name}-${collision_index}"
      name_key=${(L)base_name}
    done
    used_names[$name_key]=1
    rename_note=""
    if [[ "$base_name" != "$requested_name" ]]; then
      rename_note="; output stored as $base_name"
    fi

    family=$(classify_marker_family "$archive")
    family_dir="$OUTPUT_ROOT/extracted/$family/$base_name"
    repaired_archive="$OUTPUT_ROOT/repaired/${base_name}.repaired.zip"
    extract_dir="$family_dir"
    notes=""
    repaired_entries=0
    visible_assets=0

    verbose "repairing $archive => $family"
    if ! repair_archive "$archive" "$repaired_archive"; then
      notes="zip -FF could not rebuild a readable archive${rename_note}"
      append_report_row "$markdown_report" "$tsv_report" "$source_name" "$family" "$repaired_entries" "$visible_assets" "$notes"
      continue
    fi

    # Count non-empty lines of the entry listing; a failed listing counts as 0.
    repaired_listing=$(unzip -Z1 "$repaired_archive" 2>/dev/null)
    if [[ -n "$repaired_listing" ]]; then
      repaired_entries=$(print -r -- "$repaired_listing" | sed '/^$/d' | wc -l | tr -d ' ')
    fi

    # Extraction is best effort: a partial extract may still hold assets.
    extract_repaired_archive "$repaired_archive" "$extract_dir" || true
    visible_assets=$(find "$extract_dir" -type f \( -iname '*.jpg' -o -iname '*.jpeg' -o -iname '*.png' -o -iname '*.tiff' -o -iname '*.tif' -o -iname '*.pdf' -o -iname '*.heic' \) 2>/dev/null | wc -l | tr -d ' ')

    if (( visible_assets > 0 )); then
      notes="visible embedded assets recovered"
    elif (( repaired_entries > 0 )); then
      notes="internal iWork entries recovered"
    else
      notes="repair succeeded but no entries were listed"
    fi
    notes="${notes}${rename_note}"
    append_report_row "$markdown_report" "$tsv_report" "$source_name" "$family" "$repaired_entries" "$visible_assets" "$notes"
  done

  log "Wrote salvage output to $OUTPUT_ROOT"
  log "Report: $markdown_report"
}
# Script entry point: hand all command-line arguments to main.
main "$@"