classify-recovered-archives/classify-recovered-archives.zsh

#!/bin/zsh

# Shell behaviour: start from native zsh semantics, then enable the extended
# glob operators (the `##` repetition patterns used by trim_value and
# sanitize_name need them), keep unmatched globs as literal text, treat
# expansion of an unset parameter as an error, and make a pipeline fail when
# any of its stages fails.
emulate -L zsh
setopt extended_glob no_nomatch no_unset pipefail

# Run-time configuration. parse_args fills in SCAN_ROOT and the 0/1 flags;
# make_temp_root fills in TMP_ROOT. SCRIPT_NAME is read-only.
typeset -gr SCRIPT_NAME=${0:t}
typeset -g SCAN_ROOT=""
typeset -g DRY_RUN=0
typeset -g VERBOSE=0
typeset -g OVERWRITE=0
typeset -g SALVAGE_DAMAGED=0
# The salvage helper is looked up next to this script (resolved absolute dir).
typeset -g SALVAGE_SCRIPT_PATH="${0:A:h}/salvage-damaged-zips.zsh"
typeset -g TMP_ROOT=""

# Sub-directories of SCAN_ROOT that is_managed_output_path treats as output
# folders; archives already inside one of them are skipped, not re-processed.
typeset -ga MANAGED_DIRS=(
  Apple-iWork
  Damaged-Apple-iWork
  Damaged-Numbers
  Damaged-Pages
  Damaged-Zip
  Numbers
  Pages
  Word
  Excel
  PowerPoint
  OpenDocument-Text
  OpenDocument-Sheet
  OpenDocument-Presentation
  EPUB
  PDF
  Text
  HTML
  XML
  RichText
  Image
  JSON
  Jar
  APK
  Tar
  Unknown
)

# Counters reported in the summary printed by main.
typeset -gi PROCESSED_COUNT=0
typeset -gi CLASSIFIED_COUNT=0
typeset -gi UNKNOWN_COUNT=0
typeset -gi RENAMED_COUNT=0
typeset -gi SKIPPED_COUNT=0
typeset -gi FAILED_COUNT=0
# Summary label for RENAMED_COUNT; main switches it to "Planned" on dry runs.
typeset -g ACTION_LABEL="Renamed"

# Result of the most recent classification. The classify_* functions write
# these through set_detection / reset_detection and process_archive reads them.
typeset -g DETECTED_GROUP="Unknown"
typeset -g DETECTED_SUFFIX=""
typeset -g DETECTED_BASENAME=""
typeset -g DETECTED_CONFIDENCE="low"
typeset -g DETECTED_REASON=""
# Suffixes that create_ambiguous_iwork_copies tries for an iWork package whose
# subtype could not be determined.
typeset -ga IWORK_AMBIGUOUS_SUFFIXES=(pages numbers key)

# Print the command-line help text to stdout.
# The quoted here-doc delimiter keeps the text literal (no expansion).
usage() {
  cat <<'USAGE_TEXT'
Usage: classify-recovered-archives.zsh [options] DIRECTORY

Scan DIRECTORY for recovered .zip and .gz files, infer the original document type,
rename them with the proper suffix, and move them into labeled subdirectories.

Options:
  -n, --dry-run   Print planned actions without modifying files.
  -v, --verbose   Print extra diagnostics while scanning.
  --overwrite     Allow overwriting an existing destination file.
  --salvage-damaged
                   After classification, run salvage-damaged-zips.zsh on any
                   damaged ZIP output folders under the scan root.
  -h, --help      Show this help text.
USAGE_TEXT
}

# Write one message line to stdout. All arguments are joined the way "$*"
# joins them and emitted verbatim (no escape processing).
log() {
  printf '%s\n' "$*"
}

# Write a diagnostic line to stdout, but only when VERBOSE is non-zero.
# Always succeeds, whether or not anything was printed.
verbose() {
  (( VERBOSE )) || return 0
  printf '%s\n' "$*"
}

# Write a "warning: ..." line to stderr; stdout is left untouched.
warn() {
  printf '%s\n' "warning: $*" >&2
}

# Report a fatal problem on stderr as "error: ..." and terminate the shell
# with exit status 1.
die() {
  printf '%s\n' "error: $*" >&2
  exit 1
}

# Remove the scratch directory named by TMP_ROOT, if it is set and still
# exists. Safe to call more than once: later calls find nothing to remove.
cleanup() {
  if [[ -n ${TMP_ROOT:-} && -d ${TMP_ROOT:-} ]]; then
    rm -rf -- "$TMP_ROOT"
  fi
}

# EXIT covers every normal or `exit`-driven termination.
# INT/TERM handlers must terminate explicitly: a handler that merely runs
# cleanup returns control to the script, which would then keep classifying and
# moving files with its scratch directory already deleted.
# 130 and 143 are the conventional 128+signal exit statuses.
trap cleanup EXIT
trap 'cleanup; exit 130' INT
trap 'cleanup; exit 143' TERM

# Verify that every external program the script shells out to is available.
# Aborts through die on the first missing required tool. A missing xmllint is
# only reported through verbose.
ensure_tools() {
  local -a required
  local name

  required=(file unzip gzip tar plutil perl mktemp find strings)
  for name in "${required[@]}"; do
    if ! command -v "$name" >/dev/null 2>&1; then
      die "required tool not found: $name"
    fi
  done

  command -v xmllint >/dev/null 2>&1 \
    || verbose "xmllint not found; falling back to lightweight XML parsing"
}

# Create the private scratch directory and store its path in TMP_ROOT.
# The directory is placed under $TMPDIR, or /tmp when TMPDIR is unset or empty.
# Aborts through die when mktemp fails.
make_temp_root() {
  local template="${TMPDIR:-/tmp}/classify-recovered-archives.XXXXXX"

  if ! TMP_ROOT=$(mktemp -d "$template"); then
    die "failed to create temp directory"
  fi
}

# Put the DETECTED_* globals back to the "nothing recognised yet" state:
# group Unknown, low confidence, empty suffix and basename.
reset_detection() {
  DETECTED_GROUP="Unknown" DETECTED_SUFFIX="" DETECTED_BASENAME=""
  DETECTED_CONFIDENCE="low"
  DETECTED_REASON="no strong signature found"
}

# Record a classification verdict in the DETECTED_* globals.
# Arguments: $1 group, $2 suffix, $3 confidence, $4 reason,
#            $5 optional basename (DETECTED_BASENAME becomes empty if omitted).
set_detection() {
  DETECTED_GROUP="$1"
  DETECTED_SUFFIX="$2"
  DETECTED_CONFIDENCE="$3"
  DETECTED_REASON="$4"
  if (( $# >= 5 )); then
    DETECTED_BASENAME="$5"
  else
    DETECTED_BASENAME=""
  fi
}

# Print $1 with every tab, newline and carriage return turned into a space
# and with leading/trailing whitespace removed.
# The `##` repetition operator in the two trimming patterns is zsh
# extended-glob syntax.
trim_value() {
  local text="$1"
  local control_char

  for control_char in $'\t' $'\n' $'\r'; do
    text="${text//$control_char/ }"
  done
  text=${text##[[:space:]]##}
  text=${text%%[[:space:]]##}
  printf '%s\n' "$text"
}

# Turn an arbitrary title into a string usable as a file-name stem and print
# it. Whitespace is normalised through trim_value, NUL bytes are dropped,
# `/`, `:` and `\` become hyphens, runs of spaces collapse to one, and
# leading dots are removed. An empty result becomes "Untitled".
sanitize_name() {
  local name

  name=$(trim_value "$1")
  name=${name//$'\0'/}
  # The three separator replacements touch different characters, so their
  # relative order does not matter.
  name=${name//\\/-}
  name=${name//:/-}
  name=${name//\//-}
  name=$(tr -s ' ' <<< "$name")
  name=${name##.##}
  name=${name%%[[:space:]]##}
  name=${name##[[:space:]]##}
  printf '%s\n' "${name:-Untitled}"
}

xml_extract_title() {
  local xml_file="$1"

  perl -0ne '
    my $text = $_;
    if ($text =~ m{<(?:[[:alnum:]_]+:)?title\b[^>]*>(.*?)</(?:[[:alnum:]_]+:)?title>}is) {
      my $value = $1;
      $value =~ s/<[^>]+>//g;
      $value =~ s/&amp;/&/g;
      $value =~ s/&lt;/</g;
      $value =~ s/&gt;/>/g;
      $value =~ s/&quot;/"/g;
      $value =~ s/&apos;/'"'"'/g;
      $value =~ s/[\r\n\t]+/ /g;
      $value =~ s/^\s+|\s+$//g;
      print $value if length $value;
    }
  ' -- "$xml_file"
}

plist_extract_title() {
  local plist_file="$1"
  local xml_file="$TMP_ROOT/${RANDOM}-plist.xml"

  if ! plutil -convert xml1 -o "$xml_file" "$plist_file" >/dev/null 2>&1; then
    rm -f -- "$xml_file"
    return 0
  fi

  perl -0ne '
    my $xml = $_;
    my %pairs;
    while ($xml =~ m{<key>([^<]+)</key>\s*(?:<string>(.*?)</string>|<date>(.*?)</date>)}sg) {
      my ($key, $string, $date) = ($1, $2, $3);
      my $value = defined $string ? $string : $date;
      next unless defined $value;
      $value =~ s/&amp;/&/g;
      $value =~ s/&lt;/</g;
      $value =~ s/&gt;/>/g;
      $value =~ s/&quot;/"/g;
      $value =~ s/&apos;/'"'"'/g;
      $value =~ s/[\r\n\t]+/ /g;
      $value =~ s/^\s+|\s+$//g;
      push @{ $pairs{$key} }, $value if length $value;
    }

    for my $preferred (qw(kMDItemTitle DocumentTitle documentTitle Title title kMDItemDisplayName displayName Name name)) {
      if (exists $pairs{$preferred} && @{ $pairs{$preferred} }) {
        print $pairs{$preferred}[0];
        exit 0;
      }
    }

    for my $key (sort keys %pairs) {
      next unless $key =~ /(title|name)/i;
      if (@{ $pairs{$key} }) {
        print $pairs{$key}[0];
        exit 0;
      }
    }
  ' -- "$xml_file"

  rm -f -- "$xml_file"
}

plist_extract_value() {
  local plist_file="$1"
  local key_name="$2"
  local xml_file="$TMP_ROOT/${RANDOM}-plist-value.xml"

  if ! plutil -convert xml1 -o "$xml_file" "$plist_file" >/dev/null 2>&1; then
    rm -f -- "$xml_file"
    return 0
  fi

  TARGET_PLIST_KEY="$key_name" perl -0ne '
    my $target_key = $ENV{TARGET_PLIST_KEY};
    my $xml = $_;
    if ($xml =~ m{<key>\Q$target_key\E</key>\s*<(string|date|true|false)>(.*?)</\1>|<key>\Q$target_key\E</key>\s*<(true|false)\s*/>}sg) {
      my $tag = defined $1 ? $1 : $3;
      my $value = defined $2 ? $2 : $tag;
      $value = $tag if $tag eq q{true} || $tag eq q{false};
      $value =~ s/&amp;/&/g;
      $value =~ s/&lt;/</g;
      $value =~ s/&gt;/>/g;
      $value =~ s/&quot;/"/g;
      $value =~ s/&apos;/'"'"'/g;
      $value =~ s/[\r\n\t]+/ /g;
      $value =~ s/^\s+|\s+$//g;
      print $value;
    }
  ' -- "$xml_file"

  rm -f -- "$xml_file"
}

# Extract a single entry of a ZIP archive into the scratch directory.
#
# Arguments: $1 - archive path, $2 - entry name inside the archive.
# Globals:   TMP_ROOT (read).
# Outputs:   path of the extracted copy on stdout (success only).
# Returns:   0 on success; 1 when unzip fails, in which case the partial
#            output file is removed. The caller deletes the extracted file.
extract_zip_entry_to_temp() {
  local archive="$1"
  local entry_name="$2"
  # Only the entry's final path component is used for the scratch file name.
  local scratch="$TMP_ROOT/${RANDOM}-${entry_name:t}"

  if ! unzip -p "$archive" "$entry_name" > "$scratch" 2>/dev/null; then
    rm -f -- "$scratch"
    return 1
  fi

  printf '%s\n' "$scratch"
  return 0
}

# List the entry names of a ZIP archive, one per line, discarding unzip's
# diagnostics. The exit status is unzip's.
zip_listing() {
  local archive="$1"

  unzip -Z1 "$archive" 2>/dev/null
}

# Test whether any line of a ZIP entry listing matches an extended regex.
#
# Arguments: $1 - listing text (one entry per line), $2 - ERE pattern.
# Returns:   0 when at least one line matches, non-zero otherwise.
#
# The listing is supplied through a here-string rather than a pipe. With
# `pipefail` active, `print ... | grep -q` can report a false negative: grep
# exits on the first match, the writer may then be killed by SIGPIPE on a
# large listing, and that failure becomes the pipeline's status.
zip_has_entry() {
  local listing="$1"
  local pattern="$2"

  grep -E -q -- "$pattern" <<< "$listing"
}

# Dump the printable strings found anywhere in a file (strings -a),
# discarding diagnostics. The exit status is that of strings.
zip_string_markers() {
  local archive="$1"

  strings -a "$archive" 2>/dev/null
}

# Test whether any line of a block of text matches an extended regex.
#
# Arguments: $1 - text to search, $2 - ERE pattern.
# Returns:   0 when at least one line matches, non-zero otherwise.
#
# The text is supplied through a here-string rather than a pipe. With
# `pipefail` active, `print ... | grep -q` can report a false negative: grep
# exits on the first match, the writer may then be killed by SIGPIPE on large
# input, and that failure becomes the pipeline's status.
text_has_marker() {
  local text="$1"
  local pattern="$2"

  grep -E -q -- "$pattern" <<< "$text"
}

# Search the raw bytes of a file for an extended regex.
#
# Arguments: $1 - file to scan, $2 - ERE pattern.
# Returns:   grep's status (0 on a match).
# -a makes grep treat binary content as text; LC_ALL=C selects the C locale
# for the search.
archive_has_binary_marker() {
  local file_path="$1" regex="$2"

  LC_ALL=C grep -a -E -q -- "$regex" "$file_path"
}

# Print the document title stored in one metadata entry of a ZIP archive.
#
# Arguments: $1 - archive path, $2 - entry name; entries ending in .plist go
#            to plist_extract_title, entries ending in .xml go to
#            xml_extract_title, anything else yields an empty title.
# Outputs:   the title followed by a newline; nothing at all when the entry
#            cannot be extracted.
# Returns:   0 when extraction fails (a missing title is not an error).
extract_zip_title() {
  local archive="$1"
  local entry_name="$2"
  local scratch_file=""
  local found_title=""

  if ! scratch_file=$(extract_zip_entry_to_temp "$archive" "$entry_name"); then
    return 0
  fi

  if [[ "$entry_name" == *.plist ]]; then
    found_title=$(plist_extract_title "$scratch_file")
  elif [[ "$entry_name" == *.xml ]]; then
    found_title=$(xml_extract_title "$scratch_file")
  fi

  rm -f -- "$scratch_file"
  printf '%s\n' "$found_title"
}

# Translate the content of a package's `mimetype` entry into "GROUP|suffix".
#
# Arguments: $1 - MIME type string.
# Outputs:   "GROUP|suffix" on stdout for a known type.
# Returns:   1, printing nothing, for any other value.
map_odf_mimetype() {
  local mapping=""

  case "$1" in
    application/vnd.oasis.opendocument.text)         mapping="OpenDocument-Text|odt" ;;
    application/vnd.oasis.opendocument.spreadsheet)  mapping="OpenDocument-Sheet|ods" ;;
    application/vnd.oasis.opendocument.presentation) mapping="OpenDocument-Presentation|odp" ;;
    application/epub+zip)                            mapping="EPUB|epub" ;;
  esac

  [[ -n "$mapping" ]] || return 1
  printf '%s\n' "$mapping"
}

# Classify a ZIP container and record the verdict through set_detection.
#
# Arguments: $1 - path to the ZIP file.
# Globals:   DETECTED_* (written via set_detection).
# Returns:   0 from every matched branch; the final fall-through returns
#            set_detection's status.
#
# The checks run in a fixed order and the first match wins:
#   unreadable listing -> legacy Pages -> modern iWork -> Word -> Excel ->
#   PowerPoint -> `mimetype` entry (OpenDocument / EPUB) -> APK -> Jar ->
#   Unknown.
classify_zip() {
  local archive="$1"
  local listing=""
  local title=""
  local mime_file=""
  local mime_value=""
  local mapped=""
  local iwork_properties=""
  local is_multi_page=""

  # zip_listing failed: the entry list is unavailable, so look for path-like
  # markers in the raw bytes instead. These files keep the "zip" suffix and
  # go to one of the Damaged-* groups.
  listing=$(zip_listing "$archive") || {
    # Either marker alone is enough for this branch (regex alternation).
    if archive_has_binary_marker "$archive" 'Metadata/DocumentProperties\.plist|Pages/'; then
      set_detection "Damaged-Pages" "zip" "medium" "damaged ZIP contains Apple Pages package markers"
    elif archive_has_binary_marker "$archive" 'Index/Tables/'; then
      set_detection "Damaged-Numbers" "zip" "medium" "damaged ZIP contains Apple Numbers table markers"
    elif archive_has_binary_marker "$archive" 'Index/Document\.iwa' && archive_has_binary_marker "$archive" 'Index/CalculationEngine'; then
      set_detection "Damaged-Apple-iWork" "zip" "medium" "damaged ZIP contains Apple iWork internal markers"
    else
      set_detection "Damaged-Zip" "zip" "low" "failed to read ZIP central directory"
    fi
    return 0
  }

  # Pages layout identified by Metadata/DocumentProperties.plist plus a Pages/
  # directory; the title is read from that plist.
  if zip_has_entry "$listing" '^Metadata/DocumentProperties\.plist$' && zip_has_entry "$listing" '^Pages/'; then
    title=$(extract_zip_title "$archive" 'Metadata/DocumentProperties.plist')
    set_detection "Pages" "pages" "high" "Apple Pages package markers found" "$title"
    return 0
  fi

  # iWork layout identified by Index/Document.iwa plus Metadata/Properties.plist.
  if zip_has_entry "$listing" '^Index/Document\.iwa$' && zip_has_entry "$listing" '^Metadata/Properties\.plist$'; then
    # Extraction failure is tolerated: title and isMultiPage then stay empty.
    iwork_properties=$(extract_zip_entry_to_temp "$archive" 'Metadata/Properties.plist') || iwork_properties=""
    if [[ -n "$iwork_properties" ]]; then
      is_multi_page=$(plist_extract_value "$iwork_properties" 'isMultiPage')
      title=$(plist_extract_title "$iwork_properties")
      rm -f -- "$iwork_properties"
    fi

    # Subtype: Index/Tables/ means Numbers, isMultiPage == true means Pages,
    # otherwise the subtype stays ambiguous and the file keeps the zip suffix.
    if zip_has_entry "$listing" '^Index/Tables/'; then
      set_detection "Numbers" "numbers" "high" "modern iWork package contains Numbers table markers" "$title"
    elif [[ "$is_multi_page" == true ]]; then
      set_detection "Pages" "pages" "medium" "modern iWork package with multipage marker" "$title"
    else
      set_detection "Apple-iWork" "zip" "medium" "modern iWork package detected but subtype is ambiguous" "$title"
    fi
    return 0
  fi

  # Office Open XML: [Content_Types].xml plus a word/, xl/ or ppt/ directory.
  # The title comes from docProps/core.xml.
  if zip_has_entry "$listing" '^\[Content_Types\]\.xml$' && zip_has_entry "$listing" '^word/'; then
    title=$(extract_zip_title "$archive" 'docProps/core.xml')
    set_detection "Word" "docx" "high" "WordprocessingML markers found" "$title"
    return 0
  fi

  if zip_has_entry "$listing" '^\[Content_Types\]\.xml$' && zip_has_entry "$listing" '^xl/'; then
    title=$(extract_zip_title "$archive" 'docProps/core.xml')
    set_detection "Excel" "xlsx" "high" "SpreadsheetML markers found" "$title"
    return 0
  fi

  if zip_has_entry "$listing" '^\[Content_Types\]\.xml$' && zip_has_entry "$listing" '^ppt/'; then
    title=$(extract_zip_title "$archive" 'docProps/core.xml')
    set_detection "PowerPoint" "pptx" "high" "PresentationML markers found" "$title"
    return 0
  fi

  # Packages that carry a `mimetype` entry: its content (first 255 bytes,
  # trimmed) is mapped by map_odf_mimetype. An unmapped value falls through
  # to the checks below.
  if zip_has_entry "$listing" '^mimetype$'; then
    mime_file=$(extract_zip_entry_to_temp "$archive" 'mimetype') || mime_file=""
    if [[ -n "$mime_file" ]]; then
      mime_value=$(trim_value "$(head -c 255 -- "$mime_file" 2>/dev/null)")
      rm -f -- "$mime_file"

      mapped=$(map_odf_mimetype "$mime_value") || mapped=""
      if [[ -n "$mapped" ]]; then
        # mapped has the form "GROUP|suffix".
        local detected_group=${mapped%%|*}
        local detected_suffix=${mapped##*|}
        # Only the three OpenDocument suffixes get a title lookup (meta.xml);
        # EPUB is recorded without a title.
        if [[ "$detected_suffix" == odt || "$detected_suffix" == ods || "$detected_suffix" == odp ]]; then
          title=$(extract_zip_title "$archive" 'meta.xml')
        fi
        set_detection "$detected_group" "$detected_suffix" "high" "mimetype entry identified package type" "$title"
        return 0
      fi
    fi
  fi

  # APK is tested before Jar, so a package with both sets of markers is an APK.
  if zip_has_entry "$listing" '^AndroidManifest\.xml$' && zip_has_entry "$listing" '^classes\.dex$'; then
    set_detection "APK" "apk" "high" "Android APK markers found"
    return 0
  fi

  if zip_has_entry "$listing" '^META-INF/MANIFEST\.MF$'; then
    set_detection "Jar" "jar" "medium" "Java archive manifest found"
    return 0
  fi

  # Readable ZIP with none of the known signatures.
  set_detection "Unknown" "zip" "low" "ZIP archive lacks a strong application signature"
}

# Print the original file name recorded in a gzip header, if there is one.
#
# Arguments: $1 - path to the .gz file.
# Outputs:   the name bytes on stdout without a trailing newline. Nothing is
#            printed when the file cannot be opened, is shorter than the
#            10-byte fixed header, does not start with the bytes 1f 8b, has
#            header flag 0x08 clear, or stores an empty name.
# Returns:   the embedded Perl program exits 0 on each of its early-exit paths.
#
# The Perl program reads the flag byte at offset 3. When flag 0x04 is set it
# first skips a 2-byte little-endian length plus that many bytes, then (when
# flag 0x08 is set) collects bytes up to the first NUL or end of file.
gzip_original_name() {
  perl -e '
    use strict;
    use warnings;

    my $file = shift @ARGV;
    open my $fh, q{<:raw}, $file or exit 0;
    read($fh, my $header, 10) == 10 or exit 0;
    my @bytes = unpack(q{C10}, $header);
    exit 0 unless $bytes[0] == 0x1f && $bytes[1] == 0x8b;

    my $flags = $bytes[3];

    if ($flags & 0x04) {
      read($fh, my $xlen_raw, 2) == 2 or exit 0;
      my $xlen = unpack(q{v}, $xlen_raw);
      read($fh, my $discard, $xlen) == $xlen or exit 0;
    }

    if ($flags & 0x08) {
      my $name = q{};
      while (read($fh, my $char, 1) == 1) {
        last if $char eq "\0";
        $name .= $char;
      }
      print $name if length $name;
    }
  ' -- "$1"
}

# Reduce a file-name hint to a bare stem and print it.
#
# Arguments: $1 - hint (only its final path component is used).
#            $2 - detected suffix, e.g. "pdf.gz"; may be empty.
#
# A trailing ".gz" is dropped from both the hint and the suffix. If the
# remaining suffix is non-empty, that exact extension is removed from the
# hint when present; if it is empty, the hint's last extension is removed.
derive_basename_from_hint() {
  local hint="$1"
  local suffix="$2"
  local stem="${hint:t}"
  local inner="${suffix%.gz}"

  stem="${stem%.gz}"
  case "$inner" in
    '') stem="${stem%.*}" ;;
    *)  stem="${stem%.${inner}}" ;;
  esac

  printf '%s\n' "$stem"
}

# Classify a decompressed gzip payload from what file(1) reported about it
# and record the verdict through set_detection.
#
# Arguments: $1 - path to the decompressed payload,
#            $2 - file(1) description text,
#            $3 - file(1) MIME type.
# Globals:   DETECTED_* (written).
#
# Order: direct MIME matches, then ZIP payloads (delegated to classify_zip),
# then legacy Microsoft Office formats matched on the description text, then
# Unknown. Every recognised suffix ends in ".gz".
classify_payload_by_file_info() {
  local payload_file="$1"
  local description="$2"
  local mime_type="$3"
  local group="" inner_suffix="" confidence="" label=""
  local product="" legacy_suffix=""

  case "$mime_type" in
    application/pdf)
      group="PDF"; inner_suffix="pdf"; confidence="high"; label="PDF" ;;
    text/plain)
      group="Text"; inner_suffix="txt"; confidence="medium"; label="plain text" ;;
    text/html)
      group="HTML"; inner_suffix="html"; confidence="medium"; label="HTML" ;;
    application/xml|text/xml)
      group="XML"; inner_suffix="xml"; confidence="medium"; label="XML" ;;
    application/json|text/json)
      group="JSON"; inner_suffix="json"; confidence="medium"; label="JSON" ;;
    application/rtf)
      group="RichText"; inner_suffix="rtf"; confidence="medium"; label="RTF" ;;
    image/png)
      group="Image"; inner_suffix="png"; confidence="high"; label="PNG" ;;
    image/jpeg)
      group="Image"; inner_suffix="jpg"; confidence="high"; label="JPEG" ;;
    image/tiff)
      group="Image"; inner_suffix="tiff"; confidence="high"; label="TIFF" ;;
    image/gif)
      group="Image"; inner_suffix="gif"; confidence="high"; label="GIF" ;;
  esac

  if [[ -n "$group" ]]; then
    set_detection "$group" "${inner_suffix}.gz" "$confidence" "gzip payload detected as ${label}"
    return 0
  fi

  if [[ "$mime_type" == application/zip ]]; then
    # Reuse the ZIP classifier, then mark the result as gzip-wrapped. Only
    # the suffix and reason are rewritten; group, confidence and any title
    # found by classify_zip are kept.
    classify_zip "$payload_file"
    if [[ "$DETECTED_GROUP" == "Unknown" ]]; then
      set_detection "Unknown" "gz" "low" "gzip payload is ZIP data without a strong application signature"
    else
      DETECTED_SUFFIX="${DETECTED_SUFFIX}.gz"
      DETECTED_REASON="gzip payload wraps a recognized ${DETECTED_GROUP} package"
    fi
    return 0
  fi

  # Legacy Office formats are recognised from the description text only.
  for product in Word Excel PowerPoint; do
    [[ "$description" == *"Microsoft ${product}"* ]] || continue
    case "$product" in
      Word)       legacy_suffix="doc" ;;
      Excel)      legacy_suffix="xls" ;;
      PowerPoint) legacy_suffix="ppt" ;;
    esac
    set_detection "$product" "${legacy_suffix}.gz" "medium" "gzip payload looks like a legacy ${product} document"
    return 0
  done

  set_detection "Unknown" "gz" "low" "gzip payload type is not recognized"
}

# Classify a .gz file and record the verdict through set_detection.
#
# Arguments: $1 - path to the gzip file.
# Globals:   TMP_ROOT (read); DETECTED_* (written).
# Returns:   0 on the TAR and failed-decompress paths; otherwise the status
#            of the final payload removal.
#
# Steps: read the original name from the gzip header; if tar can list the
# file, classify it as a TAR archive; otherwise decompress the payload to the
# scratch directory and classify it from file(1) output. When the header
# carries a name and the type was recognised, that name (minus its extension)
# becomes the suggested basename.
classify_gz() {
  local archive="$1"
  local header_name=""
  local payload_file="$TMP_ROOT/${RANDOM}-payload"
  local mime_type=""
  local description=""

  header_name=$(gzip_original_name "$archive")

  if tar -tzf "$archive" >/dev/null 2>&1; then
    set_detection "Tar" "tar.gz" "high" "gzip payload is a TAR archive"
    if [[ -n "$header_name" ]]; then
      DETECTED_BASENAME=$(derive_basename_from_hint "$header_name" "tar.gz")
    fi
    return 0
  fi

  if ! gzip -cd -- "$archive" > "$payload_file" 2>/dev/null; then
    # A failed decompress may still have written part of the payload. Remove
    # it now so damaged archives do not pile up partial payloads in the
    # scratch directory for the rest of the run.
    rm -f -- "$payload_file"
    set_detection "Unknown" "gz" "low" "failed to decompress gzip payload"
    return 0
  fi

  mime_type=$(file -b --mime-type "$payload_file" 2>/dev/null)
  description=$(file -b "$payload_file" 2>/dev/null)
  classify_payload_by_file_info "$payload_file" "$description" "$mime_type"

  if [[ -n "$header_name" && "$DETECTED_GROUP" != "Unknown" ]]; then
    DETECTED_BASENAME=$(derive_basename_from_hint "$header_name" "$DETECTED_SUFFIX")
  fi

  rm -f -- "$payload_file"
}

# Decide whether a file already lives in one of this tool's output folders.
#
# Arguments: $1 - path of the file.
# Globals:   SCAN_ROOT, MANAGED_DIRS (read).
# Returns:   0 when the path, taken relative to SCAN_ROOT, starts with
#            "Salvaged/", contains a "*.salvaged/" component, or starts with
#            one of the MANAGED_DIRS names; 1 otherwise.
#
# The local is deliberately not called `path`: in zsh that name is the
# special array tied to PATH, so `local path=...` would replace the command
# search path for the duration of the function.
is_managed_output_path() {
  local candidate="$1"
  local relative="${candidate#$SCAN_ROOT/}"
  local managed_dir

  if [[ "$relative" == Salvaged/* || "$relative" == *.salvaged/* ]]; then
    return 0
  fi

  for managed_dir in "${MANAGED_DIRS[@]}"; do
    if [[ "$relative" == ${managed_dir}/* ]]; then
      return 0
    fi
  done

  return 1
}

# Choose the destination path for a file and print it.
#
# Arguments: $1 - destination directory, $2 - base name, $3 - suffix.
# Globals:   OVERWRITE (read).
#
# With OVERWRITE set, the plain "DIR/NAME.SUFFIX" is always returned. Without
# it, "-2", "-3", ... is appended to the base name until the path does not
# exist.
resolve_destination() {
  local destination_dir="$1"
  local stem="$2"
  local suffix="$3"
  local target="${destination_dir}/${stem}.${suffix}"
  local -i attempt=2

  if (( OVERWRITE == 0 )); then
    until [[ ! -e "$target" ]]; do
      target="${destination_dir}/${stem}-${attempt}.${suffix}"
      (( attempt += 1 ))
    done
  fi

  printf '%s\n' "$target"
}

# Move a file to its destination, creating the parent directory as needed.
#
# Arguments: $1 - source path, $2 - destination path.
# Globals:   DRY_RUN, OVERWRITE (read).
# Returns:   0 on a dry run (the move is only logged); 1 when the parent
#            directory cannot be created; otherwise mv's status.
perform_move() {
  local source_path="$1"
  local destination_path="$2"
  local -a mv_flags

  if (( DRY_RUN )); then
    log "DRY-RUN  $source_path -> $destination_path"
    return 0
  fi

  mkdir -p -- "${destination_path:h}" || return 1

  # -f is passed only when overwriting was requested.
  mv_flags=()
  if (( OVERWRITE )); then
    mv_flags+=(-f)
  fi
  mv "${mv_flags[@]}" -- "$source_path" "$destination_path"
}

# Copy a file to a destination, creating the parent directory as needed.
#
# Arguments: $1 - source path, $2 - destination path.
# Globals:   DRY_RUN, OVERWRITE (read).
# Returns:   0 on a dry run (the copy is only logged); 1 when the parent
#            directory cannot be created; otherwise cp's status.
#
# `--` ends option parsing so a path starting with "-" is never read as a cp
# option; this matches the guard perform_move applies to mv.
perform_copy() {
  local source_path="$1"
  local destination_path="$2"

  if (( DRY_RUN )); then
    log "DRY-RUN  copy $source_path -> $destination_path"
    return 0
  fi

  mkdir -p -- "${destination_path:h}" || return 1
  if (( OVERWRITE )); then
    cp -f -- "$source_path" "$destination_path"
  else
    cp -- "$source_path" "$destination_path"
  fi
}

# For an iWork package whose subtype is unknown, place one copy per candidate
# suffix (IWORK_AMBIGUOUS_SUFFIXES) next to it.
#
# Arguments: $1 - file to copy, $2 - destination directory, $3 - base name.
# Globals:   IWORK_AMBIGUOUS_SUFFIXES (read); FAILED_COUNT (incremented once
#            per failed copy).
create_ambiguous_iwork_copies() {
  local source_path="$1"
  local destination_dir="$2"
  local final_basename="$3"
  local probe_suffix=""
  local probe_path=""

  for probe_suffix in "${IWORK_AMBIGUOUS_SUFFIXES[@]}"; do
    probe_path=$(resolve_destination "$destination_dir" "$final_basename" "$probe_suffix")

    if ! perform_copy "$source_path" "$probe_path"; then
      (( FAILED_COUNT++ ))
      warn "failed to copy $source_path to $probe_path"
      continue
    fi

    verbose "prepared iWork probe copy: $probe_path"
    log "$source_path -> $probe_path [Apple-iWork-probe, low]"
  done
}

# Classify one archive, then rename and move it into its group folder.
#
# Arguments: $1 - path to a .zip or .gz file.
# Globals:   SCAN_ROOT (read); DETECTED_* (reset, then read); the
#            PROCESSED/CLASSIFIED/UNKNOWN/RENAMED/SKIPPED/FAILED counters.
# Returns:   0 for a skipped managed path; otherwise the status of the last
#            logging or copy step. Move failures are counted and warned
#            about, not propagated.
#
# Name selection: a title or header name found during classification wins;
# otherwise the source file's own base name is used.
process_archive() {
  local archive="$1"
  local source_name="${archive:t}"
  local source_extension="${source_name:e:l}"
  local source_basename="${source_name:r}"
  local final_basename=""
  local destination_dir=""
  local destination_path=""
  local inner_suffix=""

  # Files already inside an output folder are never re-processed.
  if is_managed_output_path "$archive"; then
    (( SKIPPED_COUNT++ ))
    verbose "skipping managed output path: $archive"
    return 0
  fi

  (( PROCESSED_COUNT++ ))
  reset_detection

  case "$source_extension" in
    zip)
      classify_zip "$archive"
      ;;
    gz)
      classify_gz "$archive"
      ;;
    *)
      set_detection "Unknown" "$source_extension" "low" "unsupported file extension"
      ;;
  esac

  if [[ "$DETECTED_GROUP" == "Unknown" ]]; then
    (( UNKNOWN_COUNT++ ))
    final_basename="$source_basename"
    DETECTED_SUFFIX=${DETECTED_SUFFIX:-$source_extension}
  else
    (( CLASSIFIED_COUNT++ ))
    if [[ -n "$DETECTED_BASENAME" ]]; then
      final_basename="$DETECTED_BASENAME"
    else
      final_basename="$source_basename"
      # For "foo.tar.gz" the base name is still "foo.tar". Drop the inner
      # extension when it equals the detected one; otherwise appending the
      # compound suffix would produce "foo.tar.tar.gz".
      if [[ "$source_extension" == gz && "$DETECTED_SUFFIX" == *.gz ]]; then
        inner_suffix=${DETECTED_SUFFIX%.gz}
        final_basename=${final_basename%.${inner_suffix}}
      fi
    fi
  fi

  final_basename=$(sanitize_name "$final_basename")
  destination_dir="$SCAN_ROOT/$DETECTED_GROUP"
  destination_path=$(resolve_destination "$destination_dir" "$final_basename" "$DETECTED_SUFFIX")

  verbose "[$DETECTED_CONFIDENCE] $archive => $DETECTED_GROUP ($DETECTED_REASON)"
  if perform_move "$archive" "$destination_path"; then
    (( RENAMED_COUNT++ ))
    log "$archive -> $destination_path [$DETECTED_GROUP, $DETECTED_CONFIDENCE]"
    # Ambiguous iWork packages also get one probe copy per candidate suffix.
    if [[ "$DETECTED_GROUP" == "Apple-iWork" ]]; then
      create_ambiguous_iwork_copies "$destination_path" "$destination_dir" "$final_basename"
    fi
  else
    (( FAILED_COUNT++ ))
    warn "failed to move $archive"
  fi
}

# Print every *.zip / *.gz regular file under SCAN_ROOT (case-insensitive),
# one path per line. The Salvaged tree and any *.salvaged tree are pruned
# from the walk.
collect_archives() {
  local -a pruned wanted

  pruned=(
    -path "$SCAN_ROOT/Salvaged"
    -o -path "$SCAN_ROOT/Salvaged/*"
    -o -path "$SCAN_ROOT/*.salvaged"
    -o -path "$SCAN_ROOT/*.salvaged/*"
  )
  wanted=(-iname '*.zip' -o -iname '*.gz')

  find "$SCAN_ROOT" \( "${pruned[@]}" \) -prune -o -type f \( "${wanted[@]}" \) -print
}

# Print each Damaged-* output folder under SCAN_ROOT that exists and holds at
# least one *.zip file (any depth, case-insensitive), one path per line, in
# the fixed order listed below.
collect_salvage_targets() {
  local folder_name folder_path

  for folder_name in Damaged-Zip Damaged-Apple-iWork Damaged-Pages Damaged-Numbers; do
    folder_path="$SCAN_ROOT/$folder_name"
    [[ -d "$folder_path" ]] || continue

    # -quit stops find at the first hit; grep only checks that a line came out.
    if find "$folder_path" -type f -iname '*.zip' -print -quit | grep -q .; then
      printf '%s\n' "$folder_path"
    fi
  done
}

# Run the external salvage script on one damaged-output folder.
#
# Arguments: $1 - folder to salvage, $2 - output root for salvaged files.
# Globals:   SALVAGE_SCRIPT_PATH, DRY_RUN, VERBOSE (read).
#
# Aborts through die when the salvage script is missing or not executable.
# --dry-run / --verbose are forwarded when the matching flag is set. A
# failing salvage run only produces a warning.
run_salvage_workflow() {
  local salvage_target="$1"
  local salvage_output_root="$2"
  local -a forwarded_flags

  if [[ ! -x "$SALVAGE_SCRIPT_PATH" ]]; then
    die "salvage script not found or not executable: $SALVAGE_SCRIPT_PATH"
  fi

  forwarded_flags=()
  if (( DRY_RUN )); then
    forwarded_flags+=(--dry-run)
  fi
  if (( VERBOSE )); then
    forwarded_flags+=(--verbose)
  fi

  log "Salvage  $salvage_target -> $salvage_output_root"
  if ! "$SALVAGE_SCRIPT_PATH" "${forwarded_flags[@]}" --output "$salvage_output_root" "$salvage_target"; then
    warn "salvage workflow failed for $salvage_target"
  fi
}

# Parse the command line into the configuration globals.
#
# Arguments: the script's positional parameters.
# Globals:   DRY_RUN, VERBOSE, OVERWRITE, SALVAGE_DAMAGED, SCAN_ROOT (written).
#
# Exactly one directory is accepted, before or after a literal `--`. The
# script exits with status 0 after --help and 1 (after printing usage) when
# no directory was given. It aborts through die on an unknown option, on more
# than one directory, or when the directory does not exist. On success
# SCAN_ROOT holds the absolute, resolved path.
parse_args() {
  local arg

  while (( $# )); do
    arg=$1
    case "$arg" in
      -n|--dry-run)
        DRY_RUN=1
        ;;
      -v|--verbose)
        VERBOSE=1
        ;;
      --overwrite)
        OVERWRITE=1
        ;;
      --salvage-damaged)
        SALVAGE_DAMAGED=1
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      --)
        shift
        break
        ;;
      -*)
        die "unknown option: $arg"
        ;;
      *)
        if [[ -n "$SCAN_ROOT" ]]; then
          die "only one directory may be provided"
        fi
        SCAN_ROOT=$arg
        ;;
    esac
    shift
  done

  # Anything left over followed `--` and is positional. Enforce the same
  # one-directory rule as above instead of silently dropping extra arguments.
  if (( $# > 0 )); then
    if [[ -n "$SCAN_ROOT" ]] || (( $# > 1 )); then
      die "only one directory may be provided"
    fi
    SCAN_ROOT=$1
    shift
  fi

  [[ -n "$SCAN_ROOT" ]] || {
    usage
    exit 1
  }

  [[ -d "$SCAN_ROOT" ]] || die "directory does not exist: $SCAN_ROOT"
  SCAN_ROOT=${SCAN_ROOT:A}
}

# Program entry point: parse arguments, check tools, classify every archive
# found under SCAN_ROOT, optionally run the salvage workflow on damaged
# output folders, then print the summary counters.
main() {
  local archive
  local salvage_target
  local salvage_output_root
  local summary_line
  local -a archives
  local -a salvage_targets

  parse_args "$@"
  ensure_tools
  make_temp_root
  if (( DRY_RUN )); then
    ACTION_LABEL="Planned"
  fi

  # One path per output line.
  archives=(${(f)"$(collect_archives)"})
  salvage_targets=(${(f)"$(collect_salvage_targets)"})

  if (( ${#archives} == 0 )); then
    # No archives: continue only if salvage was requested and has targets.
    if (( ! SALVAGE_DAMAGED || ${#salvage_targets} == 0 )); then
      log "No .zip or .gz files found under $SCAN_ROOT"
      return 0
    fi
  else
    verbose "found ${#archives} candidate archives under $SCAN_ROOT"
    for archive in "${archives[@]}"; do
      process_archive "$archive"
    done
  fi

  if (( SALVAGE_DAMAGED )); then
    # Collected again because classification may have just filled the
    # Damaged-* folders.
    salvage_targets=(${(f)"$(collect_salvage_targets)"})
    if (( ${#salvage_targets} > 0 )); then
      for salvage_target in "${salvage_targets[@]}"; do
        salvage_output_root="$SCAN_ROOT/Salvaged/${salvage_target:t}"
        run_salvage_workflow "$salvage_target" "$salvage_output_root"
      done
    else
      verbose "no damaged ZIP output folders found for salvage"
    fi
  fi

  for summary_line in \
    "" \
    "Summary" \
    "  Processed:  $PROCESSED_COUNT" \
    "  Classified: $CLASSIFIED_COUNT" \
    "  Unknown:    $UNKNOWN_COUNT" \
    "  ${ACTION_LABEL}:    $RENAMED_COUNT" \
    "  Skipped:    $SKIPPED_COUNT" \
    "  Failed:     $FAILED_COUNT"; do
    log "$summary_line"
  done
}

# Run the program, passing the script's command-line arguments through.
main "$@"