#!/bin/zsh
#
# classify-recovered-archives.zsh
#
# Scans a directory for recovered .zip and .gz files, inspects their contents
# to infer the original document type, then renames each file with a matching
# suffix and moves it into a per-type subdirectory of the scan root.
# Optionally runs a sibling salvage script on the damaged-ZIP output folders.
#
# Required external tools are checked by ensure_tools (file, unzip, gzip, tar,
# plutil, perl, mktemp, find, strings).

emulate -L zsh
setopt extended_glob no_nomatch no_unset pipefail

typeset -gr SCRIPT_NAME=${0:t}
typeset -g SCAN_ROOT=""
typeset -g DRY_RUN=0
typeset -g VERBOSE=0
typeset -g OVERWRITE=0
typeset -g SALVAGE_DAMAGED=0
typeset -g SALVAGE_SCRIPT_PATH="${0:A:h}/salvage-damaged-zips.zsh"
typeset -g TMP_ROOT=""

# Subdirectories of SCAN_ROOT that this script writes into; files already
# inside them are skipped by is_managed_output_path.
typeset -ga MANAGED_DIRS=(
  Apple-iWork
  Damaged-Apple-iWork
  Damaged-Numbers
  Damaged-Pages
  Damaged-Zip
  Numbers
  Pages
  Word
  Excel
  PowerPoint
  OpenDocument-Text
  OpenDocument-Sheet
  OpenDocument-Presentation
  EPUB
  PDF
  Text
  HTML
  XML
  RichText
  Image
  JSON
  Jar
  APK
  Tar
  Unknown
)

typeset -gi PROCESSED_COUNT=0
typeset -gi CLASSIFIED_COUNT=0
typeset -gi UNKNOWN_COUNT=0
typeset -gi RENAMED_COUNT=0
typeset -gi SKIPPED_COUNT=0
typeset -gi FAILED_COUNT=0
typeset -g ACTION_LABEL="Renamed"

# Result of the most recent classification (written by set_detection).
typeset -g DETECTED_GROUP="Unknown"
typeset -g DETECTED_SUFFIX=""
typeset -g DETECTED_BASENAME=""
typeset -g DETECTED_CONFIDENCE="low"
typeset -g DETECTED_REASON=""

# Suffixes tried as probe copies when an iWork package subtype is ambiguous.
typeset -ga IWORK_AMBIGUOUS_SUFFIXES=(pages numbers key)

# Print the help text to stdout.
usage() {
  cat <<'EOF'
Usage: classify-recovered-archives.zsh [options] DIRECTORY

Scan DIRECTORY for recovered .zip and .gz files, infer the original document
type, rename them with the proper suffix, and move them into labeled
subdirectories.

Options:
  -n, --dry-run        Print planned actions without modifying files.
  -v, --verbose        Print extra diagnostics while scanning.
      --overwrite      Allow overwriting an existing destination file.
      --salvage-damaged
                       After classification, run salvage-damaged-zips.zsh on
                       any damaged ZIP output folders under the scan root.
  -h, --help           Show this help text.
EOF
}

# Print a message to stdout.
log() {
  print -r -- "$*"
}

# Print a message to stdout only when --verbose is active.
verbose() {
  if (( VERBOSE )); then
    print -r -- "$*"
  fi
}

# Print a warning to stderr.
warn() {
  print -u2 -r -- "warning: $*"
}

# Print an error to stderr and exit with status 1.
die() {
  print -u2 -r -- "error: $*"
  exit 1
}

# Remove the temporary working directory, if one was created.
cleanup() {
  if [[ -n ${TMP_ROOT:-} && -d ${TMP_ROOT:-} ]]; then
    rm -rf -- "$TMP_ROOT"
  fi
}

trap cleanup EXIT
# A trapped signal no longer terminates the shell by itself, so exit
# explicitly; otherwise processing would continue with TMP_ROOT deleted.
trap 'cleanup; exit 130' INT
trap 'cleanup; exit 143' TERM

# Abort unless every required external tool is on PATH.
ensure_tools() {
  local tool
  for tool in file unzip gzip tar plutil perl mktemp find strings; do
    command -v "$tool" >/dev/null 2>&1 || die "required tool not found: $tool"
  done

  if ! command -v xmllint >/dev/null 2>&1; then
    verbose "xmllint not found; falling back to lightweight XML parsing"
  fi
}

# Create the private temporary directory and record it in TMP_ROOT.
make_temp_root() {
  TMP_ROOT=$(mktemp -d "${TMPDIR:-/tmp}/classify-recovered-archives.XXXXXX") \
    || die "failed to create temp directory"
}

# Create a uniquely named empty file inside TMP_ROOT and print its path.
# $1 - short label used as the file name prefix.
# mktemp is used instead of $RANDOM because $RANDOM can repeat across the
# command-substitution subshells these helpers run in.
make_temp_file() {
  mktemp "$TMP_ROOT/${1:-tmp}.XXXXXX"
}

# Reset the DETECTED_* globals to the "Unknown" defaults.
reset_detection() {
  DETECTED_GROUP="Unknown"
  DETECTED_SUFFIX=""
  DETECTED_BASENAME=""
  DETECTED_CONFIDENCE="low"
  DETECTED_REASON="no strong signature found"
}

# Record a classification result.
# $1 group (output directory name), $2 suffix, $3 confidence, $4 reason,
# $5 optional basename derived from document metadata.
set_detection() {
  DETECTED_GROUP=$1
  DETECTED_SUFFIX=$2
  DETECTED_CONFIDENCE=$3
  DETECTED_REASON=$4
  DETECTED_BASENAME=${5:-}
}

# Print $1 with CR/LF/TAB turned into spaces and surrounding whitespace removed.
trim_value() {
  local value="$1"
  value="${value//$'\r'/ }"
  value="${value//$'\n'/ }"
  value="${value//$'\t'/ }"
  value=${value##[[:space:]]##}
  value=${value%%[[:space:]]##}
  print -r -- "$value"
}

# Turn an arbitrary title into a safe file basename: strips NULs, replaces
# "/", ":" and "\" with "-", squeezes spaces, drops leading dots, and falls
# back to "Untitled" when nothing is left.
sanitize_name() {
  local value="$1"
  value=$(trim_value "$value")
  value=${value//$'\0'/}
  value=${value//\//-}
  value=${value//:/-}
  value=${value//\\/-}
  value=$(print -r -- "$value" | tr -s ' ')
  value=${value##.##}
  value=${value%%[[:space:]]##}
  value=${value##[[:space:]]##}
  if [[ -z "$value" ]]; then
    value="Untitled"
  fi
  print -r -- "$value"
}

# Print the text of the first <title> (or <ns:title>) element in an XML file,
# with nested tags removed and the five predefined XML entities decoded.
# Prints nothing when no title is found.
xml_extract_title() {
  local xml_file="$1"
  perl -0ne '
    my $text = $_;
    if ($text =~ m{<(?:[[:alnum:]_]+:)?title\b[^>]*>(.*?)</(?:[[:alnum:]_]+:)?title>}is) {
      my $value = $1;
      $value =~ s/<[^>]+>//g;
      $value =~ s/&lt;/</g;
      $value =~ s/&gt;/>/g;
      $value =~ s/&quot;/"/g;
      $value =~ s/&apos;/\x27/g;
      # Decode &amp; last so "&amp;lt;" becomes "&lt;", not "<".
      $value =~ s/&amp;/&/g;
      $value =~ s/[\r\n\t]+/ /g;
      $value =~ s/^\s+|\s+$//g;
      print $value if length $value;
    }
  ' -- "$xml_file"
}

# Print a title-like value from a property list. The plist is converted to
# XML with plutil; <key>/<string> and <key>/<date> pairs are collected, a
# fixed list of preferred keys is tried first, then any key whose name
# contains "title" or "name". Prints nothing (status 0) if conversion fails.
plist_extract_title() {
  local plist_file="$1"
  local xml_file=""

  xml_file=$(make_temp_file plist) || return 0

  if ! plutil -convert xml1 -o "$xml_file" "$plist_file" >/dev/null 2>&1; then
    rm -f -- "$xml_file"
    return 0
  fi

  perl -0ne '
    my $xml = $_;
    my %pairs;
    while ($xml =~ m{<key>([^<]+)</key>\s*(?:<string>(.*?)</string>|<date>(.*?)</date>)}sg) {
      my ($key, $string, $date) = ($1, $2, $3);
      my $value = defined $string ? $string : $date;
      next unless defined $value;
      $value =~ s/&lt;/</g;
      $value =~ s/&gt;/>/g;
      $value =~ s/&quot;/"/g;
      $value =~ s/&apos;/\x27/g;
      $value =~ s/&amp;/&/g;
      $value =~ s/[\r\n\t]+/ /g;
      $value =~ s/^\s+|\s+$//g;
      push @{ $pairs{$key} }, $value if length $value;
    }
    for my $preferred (qw(kMDItemTitle DocumentTitle documentTitle Title title kMDItemDisplayName displayName Name name)) {
      if (exists $pairs{$preferred} && @{ $pairs{$preferred} }) {
        print $pairs{$preferred}[0];
        exit 0;
      }
    }
    for my $key (sort keys %pairs) {
      next unless $key =~ /(title|name)/i;
      if (@{ $pairs{$key} }) {
        print $pairs{$key}[0];
        exit 0;
      }
    }
  ' -- "$xml_file"

  rm -f -- "$xml_file"
}

# Print the value stored under key $2 in plist $1. String and date values are
# printed as text; booleans are printed as "true" or "false". Prints nothing
# (status 0) if the plist cannot be converted or the key is absent.
plist_extract_value() {
  local plist_file="$1"
  local key_name="$2"
  local xml_file=""

  xml_file=$(make_temp_file plist-value) || return 0

  if ! plutil -convert xml1 -o "$xml_file" "$plist_file" >/dev/null 2>&1; then
    rm -f -- "$xml_file"
    return 0
  fi

  # The key is passed through the environment so it is never interpolated
  # into the Perl source.
  TARGET_PLIST_KEY="$key_name" perl -0ne '
    my $target_key = $ENV{TARGET_PLIST_KEY};
    my $xml = $_;
    if ($xml =~ m{<key>\Q$target_key\E</key>\s*<(string|date|true|false)>(.*?)</\1>|<key>\Q$target_key\E</key>\s*<(true|false)\s*/>}s) {
      my $tag = defined $1 ? $1 : $3;
      my $value = defined $2 ? $2 : $tag;
      $value = $tag if $tag eq q{true} || $tag eq q{false};
      $value =~ s/&lt;/</g;
      $value =~ s/&gt;/>/g;
      $value =~ s/&quot;/"/g;
      $value =~ s/&apos;/\x27/g;
      $value =~ s/&amp;/&/g;
      $value =~ s/[\r\n\t]+/ /g;
      $value =~ s/^\s+|\s+$//g;
      print $value;
    }
  ' -- "$xml_file"

  rm -f -- "$xml_file"
}

# Extract one entry ($2) of ZIP archive $1 into a temp file and print the
# temp file path. Returns 1 (and leaves no file behind) on failure.
extract_zip_entry_to_temp() {
  local archive="$1"
  local entry_name="$2"
  local destination=""

  destination=$(make_temp_file entry) || return 1

  if unzip -p "$archive" "$entry_name" > "$destination" 2>/dev/null; then
    print -r -- "$destination"
    return 0
  fi

  rm -f -- "$destination"
  return 1
}

# Print the entry names of ZIP archive $1, one per line.
zip_listing() {
  unzip -Z1 "$1" 2>/dev/null
}

# Succeed when any line of listing $1 matches extended regex $2.
# A here-string is used rather than a pipeline: with pipefail set, grep -q
# exiting early could make a successful match look like a failure.
zip_has_entry() {
  local listing="$1"
  local pattern="$2"
  grep -E -q -- "$pattern" <<<"$listing"
}

# Print the printable strings found anywhere in file $1.
zip_string_markers() {
  strings -a "$1" 2>/dev/null
}

# Succeed when text $1 matches extended regex $2.
text_has_marker() {
  local text="$1"
  local pattern="$2"
  grep -E -q -- "$pattern" <<<"$text"
}

# Succeed when the raw bytes of file $1 match extended regex $2.
archive_has_binary_marker() {
  local archive="$1"
  local pattern="$2"
  LC_ALL=C grep -aE -q -- "$pattern" "$archive"
}

# Print the document title stored in entry $2 of ZIP archive $1.
# *.plist entries go through plist_extract_title, *.xml entries through
# xml_extract_title. Prints an empty line when nothing is found.
extract_zip_title() {
  local archive="$1"
  local entry_name="$2"
  local extracted=""
  local title=""

  extracted=$(extract_zip_entry_to_temp "$archive" "$entry_name") || return 0

  case "$entry_name" in
    *.plist)
      title=$(plist_extract_title "$extracted")
      ;;
    *.xml)
      title=$(xml_extract_title "$extracted")
      ;;
  esac

  rm -f -- "$extracted"
  print -r -- "$title"
}

# Map the content of a "mimetype" entry to "Group|suffix".
# Returns 1 for MIME types that are not recognized.
map_odf_mimetype() {
  case "$1" in
    application/vnd.oasis.opendocument.text)
      print -r -- "OpenDocument-Text|odt"
      ;;
    application/vnd.oasis.opendocument.spreadsheet)
      print -r -- "OpenDocument-Sheet|ods"
      ;;
    application/vnd.oasis.opendocument.presentation)
      print -r -- "OpenDocument-Presentation|odp"
      ;;
    application/epub+zip)
      print -r -- "EPUB|epub"
      ;;
    *)
      return 1
      ;;
  esac
}

# Classify ZIP archive $1 and store the result via set_detection.
# Always returns 0. When the archive listing cannot be read, the raw bytes
# are searched for iWork markers to pick one of the Damaged-* groups.
classify_zip() {
  local archive="$1"
  local listing=""
  local title=""
  local mime_file=""
  local mime_value=""
  local mapped=""
  local iwork_properties=""
  local is_multi_page=""

  listing=$(zip_listing "$archive") || {
    if archive_has_binary_marker "$archive" 'Metadata/DocumentProperties\.plist|Pages/'; then
      set_detection "Damaged-Pages" "zip" "medium" "damaged ZIP contains Apple Pages package markers"
    elif archive_has_binary_marker "$archive" 'Index/Tables/'; then
      set_detection "Damaged-Numbers" "zip" "medium" "damaged ZIP contains Apple Numbers table markers"
    elif archive_has_binary_marker "$archive" 'Index/Document\.iwa' \
      && archive_has_binary_marker "$archive" 'Index/CalculationEngine'; then
      set_detection "Damaged-Apple-iWork" "zip" "medium" "damaged ZIP contains Apple iWork internal markers"
    else
      set_detection "Damaged-Zip" "zip" "low" "failed to read ZIP central directory"
    fi
    return 0
  }

  # Pages package layout identified by DocumentProperties.plist + Pages/.
  if zip_has_entry "$listing" '^Metadata/DocumentProperties\.plist$' \
    && zip_has_entry "$listing" '^Pages/'; then
    title=$(extract_zip_title "$archive" 'Metadata/DocumentProperties.plist')
    set_detection "Pages" "pages" "high" "Apple Pages package markers found" "$title"
    return 0
  fi

  # iWork package layout identified by Index/Document.iwa + Properties.plist.
  if zip_has_entry "$listing" '^Index/Document\.iwa$' \
    && zip_has_entry "$listing" '^Metadata/Properties\.plist$'; then
    iwork_properties=$(extract_zip_entry_to_temp "$archive" 'Metadata/Properties.plist') \
      || iwork_properties=""
    if [[ -n "$iwork_properties" ]]; then
      is_multi_page=$(plist_extract_value "$iwork_properties" 'isMultiPage')
      title=$(plist_extract_title "$iwork_properties")
      rm -f -- "$iwork_properties"
    fi

    if zip_has_entry "$listing" '^Index/Tables/'; then
      set_detection "Numbers" "numbers" "high" "modern iWork package contains Numbers table markers" "$title"
    elif [[ "$is_multi_page" == true ]]; then
      set_detection "Pages" "pages" "medium" "modern iWork package with multipage marker" "$title"
    else
      set_detection "Apple-iWork" "zip" "medium" "modern iWork package detected but subtype is ambiguous" "$title"
    fi
    return 0
  fi

  # Office Open XML packages.
  if zip_has_entry "$listing" '^\[Content_Types\]\.xml$' \
    && zip_has_entry "$listing" '^word/'; then
    title=$(extract_zip_title "$archive" 'docProps/core.xml')
    set_detection "Word" "docx" "high" "WordprocessingML markers found" "$title"
    return 0
  fi

  if zip_has_entry "$listing" '^\[Content_Types\]\.xml$' \
    && zip_has_entry "$listing" '^xl/'; then
    title=$(extract_zip_title "$archive" 'docProps/core.xml')
    set_detection "Excel" "xlsx" "high" "SpreadsheetML markers found" "$title"
    return 0
  fi

  if zip_has_entry "$listing" '^\[Content_Types\]\.xml$' \
    && zip_has_entry "$listing" '^ppt/'; then
    title=$(extract_zip_title "$archive" 'docProps/core.xml')
    set_detection "PowerPoint" "pptx" "high" "PresentationML markers found" "$title"
    return 0
  fi

  # OpenDocument / EPUB: the "mimetype" entry names the package type.
  if zip_has_entry "$listing" '^mimetype$'; then
    mime_file=$(extract_zip_entry_to_temp "$archive" 'mimetype') || mime_file=""
    if [[ -n "$mime_file" ]]; then
      mime_value=$(trim_value "$(head -c 255 -- "$mime_file" 2>/dev/null)")
      rm -f -- "$mime_file"
      mapped=$(map_odf_mimetype "$mime_value") || mapped=""
      if [[ -n "$mapped" ]]; then
        local detected_group=${mapped%%|*}
        local detected_suffix=${mapped##*|}
        # Only the OpenDocument types are looked up in meta.xml; EPUB is not.
        if [[ "$detected_suffix" == odt || "$detected_suffix" == ods || "$detected_suffix" == odp ]]; then
          title=$(extract_zip_title "$archive" 'meta.xml')
        fi
        set_detection "$detected_group" "$detected_suffix" "high" "mimetype entry identified package type" "$title"
        return 0
      fi
    fi
  fi

  if zip_has_entry "$listing" '^AndroidManifest\.xml$' \
    && zip_has_entry "$listing" '^classes\.dex$'; then
    set_detection "APK" "apk" "high" "Android APK markers found"
    return 0
  fi

  if zip_has_entry "$listing" '^META-INF/MANIFEST\.MF$'; then
    set_detection "Jar" "jar" "medium" "Java archive manifest found"
    return 0
  fi

  set_detection "Unknown" "zip" "low" "ZIP archive lacks a strong application signature"
}

# Print the original file name recorded in the gzip header of $1 (the FNAME
# field, present when header flag 0x08 is set). Prints nothing when the file
# is not gzip or carries no name.
gzip_original_name() {
  perl -e '
    use strict;
    use warnings;

    my $file = shift @ARGV;
    open my $fh, q{<:raw}, $file or exit 0;
    read($fh, my $header, 10) == 10 or exit 0;
    my @bytes = unpack(q{C10}, $header);
    exit 0 unless $bytes[0] == 0x1f && $bytes[1] == 0x8b;
    my $flags = $bytes[3];

    # Flag 0x04: an "extra" field precedes the name and must be skipped.
    if ($flags & 0x04) {
      read($fh, my $xlen_raw, 2) == 2 or exit 0;
      my $xlen = unpack(q{v}, $xlen_raw);
      read($fh, my $discard, $xlen) == $xlen or exit 0;
    }

    # Flag 0x08: a NUL-terminated original file name follows.
    if ($flags & 0x08) {
      my $name = q{};
      while (read($fh, my $char, 1) == 1) {
        last if $char eq "\0";
        $name .= $char;
      }
      print $name if length $name;
    }
  ' -- "$1"
}

# Derive a basename from a gzip header name ($1) given the detected suffix
# ($2): takes the last path component, drops a trailing ".gz", then drops the
# inner suffix (or, when the suffix is empty, the last extension).
derive_basename_from_hint() {
  local hint="$1"
  local suffix="$2"
  local base="${hint:t}"
  local inner_suffix="$suffix"

  base=${base%.gz}
  if [[ "$inner_suffix" == *.gz ]]; then
    inner_suffix=${inner_suffix%.gz}
  fi

  if [[ -n "$inner_suffix" ]]; then
    base=${base%.${inner_suffix}}
  else
    base=${base%.*}
  fi

  print -r -- "$base"
}

# Classify a decompressed gzip payload and store the result via
# set_detection.
# $1 payload file, $2 `file -b` description, $3 `file -b --mime-type` value.
classify_payload_by_file_info() {
  local payload_file="$1"
  local description="$2"
  local mime_type="$3"

  case "$mime_type" in
    application/pdf)
      set_detection "PDF" "pdf.gz" "high" "gzip payload detected as PDF"
      return 0
      ;;
    text/plain)
      set_detection "Text" "txt.gz" "medium" "gzip payload detected as plain text"
      return 0
      ;;
    text/html)
      set_detection "HTML" "html.gz" "medium" "gzip payload detected as HTML"
      return 0
      ;;
    application/xml|text/xml)
      set_detection "XML" "xml.gz" "medium" "gzip payload detected as XML"
      return 0
      ;;
    application/json|text/json)
      set_detection "JSON" "json.gz" "medium" "gzip payload detected as JSON"
      return 0
      ;;
    application/rtf)
      set_detection "RichText" "rtf.gz" "medium" "gzip payload detected as RTF"
      return 0
      ;;
    image/png)
      set_detection "Image" "png.gz" "high" "gzip payload detected as PNG"
      return 0
      ;;
    image/jpeg)
      set_detection "Image" "jpg.gz" "high" "gzip payload detected as JPEG"
      return 0
      ;;
    image/tiff)
      set_detection "Image" "tiff.gz" "high" "gzip payload detected as TIFF"
      return 0
      ;;
    image/gif)
      set_detection "Image" "gif.gz" "high" "gzip payload detected as GIF"
      return 0
      ;;
    application/zip)
      # A gzip-wrapped ZIP: reuse the ZIP classifier and append ".gz".
      classify_zip "$payload_file"
      if [[ "$DETECTED_GROUP" != "Unknown" ]]; then
        DETECTED_SUFFIX="${DETECTED_SUFFIX}.gz"
        DETECTED_REASON="gzip payload wraps a recognized ${DETECTED_GROUP} package"
      else
        set_detection "Unknown" "gz" "low" "gzip payload is ZIP data without a strong application signature"
      fi
      return 0
      ;;
  esac

  # No MIME match: fall back to the textual description from file(1).
  if [[ "$description" == *"Microsoft Word"* ]]; then
    set_detection "Word" "doc.gz" "medium" "gzip payload looks like a legacy Word document"
    return 0
  fi

  if [[ "$description" == *"Microsoft Excel"* ]]; then
    set_detection "Excel" "xls.gz" "medium" "gzip payload looks like a legacy Excel document"
    return 0
  fi

  if [[ "$description" == *"Microsoft PowerPoint"* ]]; then
    set_detection "PowerPoint" "ppt.gz" "medium" "gzip payload looks like a legacy PowerPoint document"
    return 0
  fi

  set_detection "Unknown" "gz" "low" "gzip payload type is not recognized"
}

# Classify gzip file $1 and store the result via set_detection.
# Always returns 0.
classify_gz() {
  local archive="$1"
  local header_name=""
  local payload_file=""
  local mime_type=""
  local description=""

  header_name=$(gzip_original_name "$archive")

  # NOTE(review): some tar implementations can list formats other than tar
  # (bsdtar reportedly reads zip, for one); if so, a gzip-wrapped non-tar
  # archive would be labelled "Tar" here — confirm on the target platform.
  if tar -tzf "$archive" >/dev/null 2>&1; then
    set_detection "Tar" "tar.gz" "high" "gzip payload is a TAR archive"
    if [[ -n "$header_name" ]]; then
      DETECTED_BASENAME=$(derive_basename_from_hint "$header_name" "tar.gz")
    fi
    return 0
  fi

  payload_file=$(make_temp_file payload) || {
    set_detection "Unknown" "gz" "low" "failed to decompress gzip payload"
    return 0
  }

  if ! gzip -cd -- "$archive" > "$payload_file" 2>/dev/null; then
    # Drop the partial output so failed files do not pile up in TMP_ROOT.
    rm -f -- "$payload_file"
    set_detection "Unknown" "gz" "low" "failed to decompress gzip payload"
    return 0
  fi

  mime_type=$(file -b --mime-type "$payload_file" 2>/dev/null)
  description=$(file -b "$payload_file" 2>/dev/null)
  classify_payload_by_file_info "$payload_file" "$description" "$mime_type"

  # A name from the gzip header overrides any title found inside the payload.
  if [[ -n "$header_name" && "$DETECTED_GROUP" != "Unknown" ]]; then
    DETECTED_BASENAME=$(derive_basename_from_hint "$header_name" "$DETECTED_SUFFIX")
  fi

  rm -f -- "$payload_file"
}

# Succeed when $1 lies inside a directory this script (or the salvage
# workflow) writes to: Salvaged/, any *.salvaged/ directory, or one of
# MANAGED_DIRS directly under SCAN_ROOT.
is_managed_output_path() {
  # Deliberately not named "path": in zsh that is the special array tied to
  # PATH, and a local of that name would change PATH inside this function.
  local candidate_path="$1"
  local relative="${candidate_path#$SCAN_ROOT/}"
  local managed_dir

  if [[ "$relative" == Salvaged/* || "$relative" == *.salvaged/* ]]; then
    return 0
  fi

  for managed_dir in $MANAGED_DIRS; do
    if [[ "$relative" == ${managed_dir}/* ]]; then
      return 0
    fi
  done

  return 1
}

# Print the destination path for basename $2 with suffix $3 in directory $1.
# Unless --overwrite is active, "-2", "-3", ... is appended to the basename
# until the path does not exist.
resolve_destination() {
  local destination_dir="$1"
  local basename="$2"
  local suffix="$3"
  local candidate="$destination_dir/$basename.$suffix"
  local counter=2

  if (( OVERWRITE )); then
    print -r -- "$candidate"
    return 0
  fi

  while [[ -e "$candidate" ]]; do
    candidate="$destination_dir/$basename-$counter.$suffix"
    (( counter++ ))
  done

  print -r -- "$candidate"
}

# Move $1 to $2, creating the destination directory first.
# In dry-run mode only logs the planned move.
perform_move() {
  local source_path="$1"
  local destination_path="$2"

  if (( DRY_RUN )); then
    log "DRY-RUN $source_path -> $destination_path"
    return 0
  fi

  mkdir -p -- "${destination_path:h}" || return 1
  if (( OVERWRITE )); then
    mv -f -- "$source_path" "$destination_path"
  else
    mv -- "$source_path" "$destination_path"
  fi
}

# Copy $1 to $2, creating the destination directory first.
# In dry-run mode only logs the planned copy.
perform_copy() {
  local source_path="$1"
  local destination_path="$2"

  if (( DRY_RUN )); then
    log "DRY-RUN copy $source_path -> $destination_path"
    return 0
  fi

  mkdir -p -- "${destination_path:h}" || return 1
  if (( OVERWRITE )); then
    cp -f -- "$source_path" "$destination_path"
  else
    cp -- "$source_path" "$destination_path"
  fi
}

# For an ambiguous iWork package, create one copy per suffix in
# IWORK_AMBIGUOUS_SUFFIXES so each can be tried in the matching application.
# $1 source file, $2 destination directory, $3 basename for the copies.
create_ambiguous_iwork_copies() {
  local source_path="$1"
  local destination_dir="$2"
  local final_basename="$3"
  local suffix=""
  local copy_path=""

  for suffix in $IWORK_AMBIGUOUS_SUFFIXES; do
    copy_path=$(resolve_destination "$destination_dir" "$final_basename" "$suffix")
    if perform_copy "$source_path" "$copy_path"; then
      verbose "prepared iWork probe copy: $copy_path"
      log "$source_path -> $copy_path [Apple-iWork-probe, low]"
    else
      (( FAILED_COUNT++ ))
      warn "failed to copy $source_path to $copy_path"
    fi
  done
}

# Classify one archive ($1), move it into "$SCAN_ROOT/<group>/" under its new
# name, and update the counters. Files already in managed output directories
# are counted as skipped.
process_archive() {
  local archive="$1"
  local source_name="${archive:t}"
  local source_extension="${source_name:e:l}"
  local source_basename="${source_name:r}"
  local final_basename=""
  local destination_dir=""
  local destination_path=""

  if is_managed_output_path "$archive"; then
    (( SKIPPED_COUNT++ ))
    verbose "skipping managed output path: $archive"
    return 0
  fi

  (( PROCESSED_COUNT++ ))
  reset_detection

  case "$source_extension" in
    zip)
      classify_zip "$archive"
      ;;
    gz)
      classify_gz "$archive"
      ;;
    *)
      set_detection "Unknown" "$source_extension" "low" "unsupported file extension"
      ;;
  esac

  if [[ "$DETECTED_GROUP" == "Unknown" ]]; then
    (( UNKNOWN_COUNT++ ))
    final_basename="$source_basename"
    DETECTED_SUFFIX=${DETECTED_SUFFIX:-$source_extension}
  else
    (( CLASSIFIED_COUNT++ ))
    if [[ -n "$DETECTED_BASENAME" ]]; then
      final_basename="$DETECTED_BASENAME"
    else
      final_basename="$source_basename"
    fi
  fi

  final_basename=$(sanitize_name "$final_basename")
  destination_dir="$SCAN_ROOT/$DETECTED_GROUP"
  destination_path=$(resolve_destination "$destination_dir" "$final_basename" "$DETECTED_SUFFIX")

  verbose "[$DETECTED_CONFIDENCE] $archive => $DETECTED_GROUP ($DETECTED_REASON)"

  if perform_move "$archive" "$destination_path"; then
    (( RENAMED_COUNT++ ))
    log "$archive -> $destination_path [$DETECTED_GROUP, $DETECTED_CONFIDENCE]"
    if [[ "$DETECTED_GROUP" == "Apple-iWork" ]]; then
      create_ambiguous_iwork_copies "$destination_path" "$destination_dir" "$final_basename"
    fi
  else
    (( FAILED_COUNT++ ))
    warn "failed to move $archive"
  fi
}

# Print every *.zip / *.gz file under SCAN_ROOT, one per line, pruning the
# Salvaged and *.salvaged directories directly under the scan root.
collect_archives() {
  find "$SCAN_ROOT" \
    \( -path "$SCAN_ROOT/Salvaged" \
       -o -path "$SCAN_ROOT/Salvaged/*" \
       -o -path "$SCAN_ROOT/*.salvaged" \
       -o -path "$SCAN_ROOT/*.salvaged/*" \) -prune \
    -o -type f \( -iname '*.zip' -o -iname '*.gz' \) -print
}

# Print each Damaged-* output directory that contains at least one *.zip.
collect_salvage_targets() {
  local damaged_dir
  local first_match=""

  for damaged_dir in Damaged-Zip Damaged-Apple-iWork Damaged-Pages Damaged-Numbers; do
    [[ -d "$SCAN_ROOT/$damaged_dir" ]] || continue
    # Capture the first hit instead of piping into grep -q, so the result
    # does not depend on pipeline exit statuses under pipefail.
    first_match=$(find "$SCAN_ROOT/$damaged_dir" -type f -iname '*.zip' -print -quit)
    if [[ -n "$first_match" ]]; then
      print -r -- "$SCAN_ROOT/$damaged_dir"
    fi
  done
}

# Run the salvage script on directory $1, writing results below $2.
# --dry-run and --verbose are forwarded. Aborts if the script is missing or
# not executable; a failing run only produces a warning.
run_salvage_workflow() {
  local salvage_target="$1"
  local salvage_output_root="$2"
  local -a salvage_cmd

  [[ -x "$SALVAGE_SCRIPT_PATH" ]] \
    || die "salvage script not found or not executable: $SALVAGE_SCRIPT_PATH"

  salvage_cmd=("$SALVAGE_SCRIPT_PATH")
  if (( DRY_RUN )); then
    salvage_cmd+=(--dry-run)
  fi
  if (( VERBOSE )); then
    salvage_cmd+=(--verbose)
  fi
  salvage_cmd+=(--output "$salvage_output_root" "$salvage_target")

  log "Salvage $salvage_target -> $salvage_output_root"
  "${salvage_cmd[@]}" || warn "salvage workflow failed for $salvage_target"
}

# Parse command-line arguments into the option globals and SCAN_ROOT.
# Exits with usage when no directory is given, and with an error for unknown
# options, more than one directory, or a directory that does not exist.
parse_args() {
  local arg

  while (( $# )); do
    arg=$1
    case "$arg" in
      -n|--dry-run)
        DRY_RUN=1
        ;;
      -v|--verbose)
        VERBOSE=1
        ;;
      --overwrite)
        OVERWRITE=1
        ;;
      --salvage-damaged)
        SALVAGE_DAMAGED=1
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      --)
        shift
        break
        ;;
      -*)
        die "unknown option: $arg"
        ;;
      *)
        if [[ -n "$SCAN_ROOT" ]]; then
          die "only one directory may be provided"
        fi
        SCAN_ROOT=$arg
        ;;
    esac
    shift
  done

  # Operands after "--" are taken literally, even if they start with "-".
  if [[ -z "$SCAN_ROOT" && $# -gt 0 ]]; then
    SCAN_ROOT=$1
    shift
  fi

  # Anything still left is a second directory; reject it like the loop does.
  (( $# == 0 )) || die "only one directory may be provided"

  [[ -n "$SCAN_ROOT" ]] || {
    usage
    exit 1
  }

  [[ -d "$SCAN_ROOT" ]] || die "directory does not exist: $SCAN_ROOT"
  SCAN_ROOT=${SCAN_ROOT:A}
}

# Entry point: parse arguments, classify every archive found, optionally run
# the salvage workflow, then print a summary of the counters.
main() {
  local archive
  local salvage_target
  local salvage_output_root
  local -a archives
  local -a salvage_targets

  parse_args "$@"
  ensure_tools
  make_temp_root

  if (( DRY_RUN )); then
    ACTION_LABEL="Planned"
  fi

  archives=(${(f)"$(collect_archives)"})
  salvage_targets=(${(f)"$(collect_salvage_targets)"})

  if (( ${#archives} == 0 && (! SALVAGE_DAMAGED || ${#salvage_targets} == 0) )); then
    log "No .zip or .gz files found under $SCAN_ROOT"
    return 0
  fi

  if (( ${#archives} > 0 )); then
    verbose "found ${#archives} candidate archives under $SCAN_ROOT"
    for archive in $archives; do
      process_archive "$archive"
    done
  fi

  if (( SALVAGE_DAMAGED )); then
    # Re-scan: classification above may have just populated Damaged-* dirs.
    salvage_targets=(${(f)"$(collect_salvage_targets)"})
    if (( ${#salvage_targets} == 0 )); then
      verbose "no damaged ZIP output folders found for salvage"
    else
      for salvage_target in $salvage_targets; do
        salvage_output_root="$SCAN_ROOT/Salvaged/${salvage_target:t}"
        run_salvage_workflow "$salvage_target" "$salvage_output_root"
      done
    fi
  fi

  log ""
  log "Summary"
  log " Processed: $PROCESSED_COUNT"
  log " Classified: $CLASSIFIED_COUNT"
  log " Unknown: $UNKNOWN_COUNT"
  log " ${ACTION_LABEL}: $RENAMED_COUNT"
  log " Skipped: $SKIPPED_COUNT"
  log " Failed: $FAILED_COUNT"
}

main "$@"