885 lines
23 KiB
Bash
Executable File
885 lines
23 KiB
Bash
Executable File
#!/bin/zsh
|
|
|
|
emulate -L zsh
|
|
setopt extended_glob no_nomatch no_unset pipefail
|
|
|
|
# Basename of the running script (read-only).
typeset -gr SCRIPT_NAME=${0:t}

# Path settings: SCAN_ROOT is filled in by parse_args, TMP_ROOT by
# make_temp_root; both start empty.
typeset -g SCAN_ROOT="" TMP_ROOT=""

# Command-line switches, all off (0) until parse_args turns them on.
typeset -g DRY_RUN=0 VERBOSE=0 OVERWRITE=0 SALVAGE_DAMAGED=0

# The salvage helper is expected to sit next to this script.
typeset -g SALVAGE_SCRIPT_PATH="${0:A:h}/salvage-damaged-zips.zsh"
|
|
|
|
typeset -ga MANAGED_DIRS=(
|
|
Apple-iWork
|
|
Damaged-Apple-iWork
|
|
Damaged-Numbers
|
|
Damaged-Pages
|
|
Damaged-Zip
|
|
Numbers
|
|
Pages
|
|
Word
|
|
Excel
|
|
PowerPoint
|
|
OpenDocument-Text
|
|
OpenDocument-Sheet
|
|
OpenDocument-Presentation
|
|
EPUB
|
|
PDF
|
|
Text
|
|
HTML
|
|
XML
|
|
RichText
|
|
Image
|
|
JSON
|
|
Jar
|
|
APK
|
|
Tar
|
|
Unknown
|
|
)
|
|
|
|
# Integer tallies reported in the end-of-run summary.
typeset -gi PROCESSED_COUNT=0 CLASSIFIED_COUNT=0 UNKNOWN_COUNT=0
typeset -gi RENAMED_COUNT=0 SKIPPED_COUNT=0 FAILED_COUNT=0

# Summary label for RENAMED_COUNT; main switches it to "Planned" in dry-run mode.
typeset -g ACTION_LABEL="Renamed"
|
|
|
|
# Result of the most recent classification. Written by set_detection and
# reset_detection, read by process_archive.
typeset -g DETECTED_GROUP="Unknown" DETECTED_CONFIDENCE="low"
typeset -g DETECTED_SUFFIX="" DETECTED_BASENAME="" DETECTED_REASON=""

# Extensions tried, one copy each, when an iWork package cannot be narrowed
# down to a single application.
typeset -ga IWORK_AMBIGUOUS_SUFFIXES=(pages numbers key)
|
|
|
|
# Print the command-line help text on standard output.
#
# The script name shown is SCRIPT_NAME (the basename the script was invoked
# as), falling back to the canonical file name if that is not set. The
# here-document is unquoted so the name expands; the rest of the text holds
# no `$`, backquote or backslash characters.
usage() {
  cat <<EOF
Usage: ${SCRIPT_NAME:-classify-recovered-archives.zsh} [options] DIRECTORY

Scan DIRECTORY for recovered .zip and .gz files, infer the original document type,
rename them with the proper suffix, and move them into labeled subdirectories.

Options:
  -n, --dry-run     Print planned actions without modifying files.
  -v, --verbose     Print extra diagnostics while scanning.
      --overwrite   Allow overwriting an existing destination file.
      --salvage-damaged
                    After classification, run salvage-damaged-zips.zsh on any
                    damaged ZIP output folders under the scan root.
  -h, --help        Show this help text.
EOF
}
|
|
|
|
log() {
|
|
print -r -- "$*"
|
|
}
|
|
|
|
verbose() {
|
|
if (( VERBOSE )); then
|
|
print -r -- "$*"
|
|
fi
|
|
}
|
|
|
|
warn() {
|
|
print -u2 -r -- "warning: $*"
|
|
}
|
|
|
|
die() {
|
|
print -u2 -r -- "error: $*"
|
|
exit 1
|
|
}
|
|
|
|
# Delete the scratch directory recorded in TMP_ROOT, if there is one.
# Does nothing when TMP_ROOT is unset, empty, or not a directory.
cleanup() {
  local scratch=${TMP_ROOT:-}

  [[ -n $scratch ]] || return 0
  [[ -d $scratch ]] || return 0

  rm -rf -- "$scratch"
}
|
|
|
|
# Remove the scratch directory whenever the shell exits.
#
# INT and TERM get handlers that exit explicitly. A handler that only ran
# cleanup would hand control back to the script, which would then carry on
# processing with its scratch directory already deleted. The exit raised
# inside the handler runs the EXIT trap, so cleanup still happens once.
# 130 and 143 are the conventional 128+signal exit statuses.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
|
|
|
|
# Abort via die if any external command this script needs is missing.
# xmllint is optional: its absence only produces a verbose note.
ensure_tools() {
  local -a required=(file unzip gzip tar plutil perl mktemp find strings)
  local tool

  for tool in $required; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      die "required tool not found: $tool"
    fi
  done

  command -v xmllint >/dev/null 2>&1 \
    || verbose "xmllint not found; falling back to lightweight XML parsing"
}
|
|
|
|
# Create a private scratch directory under $TMPDIR (or /tmp when TMPDIR is
# unset or empty) and record its path in TMP_ROOT. Aborts via die when
# mktemp fails.
make_temp_root() {
  local base_dir=${TMPDIR:-/tmp}
  local template="${base_dir}/classify-recovered-archives.XXXXXX"

  if ! TMP_ROOT=$(mktemp -d "$template"); then
    die "failed to create temp directory"
  fi
}
|
|
|
|
# Put the DETECTED_* globals back to the "nothing recognised" state.
reset_detection() {
  DETECTED_GROUP="Unknown" DETECTED_CONFIDENCE="low"
  DETECTED_SUFFIX="" DETECTED_BASENAME=""
  DETECTED_REASON="no strong signature found"
}
|
|
|
|
# Record a classification result in the DETECTED_* globals.
#   $1 group (output folder)   $2 file suffix   $3 confidence   $4 reason
#   $5 optional basename hint; DETECTED_BASENAME becomes empty when omitted
set_detection() {
  local group=$1 suffix=$2 confidence=$3 reason=$4
  local basename_hint=${5:-}

  DETECTED_GROUP=$group
  DETECTED_SUFFIX=$suffix
  DETECTED_CONFIDENCE=$confidence
  DETECTED_REASON=$reason
  DETECTED_BASENAME=$basename_hint
}
|
|
|
|
trim_value() {
|
|
local value="$1"
|
|
value="${value//$'\r'/ }"
|
|
value="${value//$'\n'/ }"
|
|
value="${value//$'\t'/ }"
|
|
value=${value##[[:space:]]##}
|
|
value=${value%%[[:space:]]##}
|
|
print -r -- "$value"
|
|
}
|
|
|
|
sanitize_name() {
|
|
local value="$1"
|
|
value=$(trim_value "$value")
|
|
value=${value//$'\0'/}
|
|
value=${value//\//-}
|
|
value=${value//:/-}
|
|
value=${value//\\/-}
|
|
value=$(print -r -- "$value" | tr -s ' ')
|
|
value=${value##.##}
|
|
value=${value%%[[:space:]]##}
|
|
value=${value##[[:space:]]##}
|
|
if [[ -z "$value" ]]; then
|
|
value="Untitled"
|
|
fi
|
|
print -r -- "$value"
|
|
}
|
|
|
|
xml_extract_title() {
|
|
local xml_file="$1"
|
|
|
|
perl -0ne '
|
|
my $text = $_;
|
|
if ($text =~ m{<(?:[[:alnum:]_]+:)?title\b[^>]*>(.*?)</(?:[[:alnum:]_]+:)?title>}is) {
|
|
my $value = $1;
|
|
$value =~ s/<[^>]+>//g;
|
|
$value =~ s/&/&/g;
|
|
$value =~ s/</</g;
|
|
$value =~ s/>/>/g;
|
|
$value =~ s/"/"/g;
|
|
$value =~ s/'/'"'"'/g;
|
|
$value =~ s/[\r\n\t]+/ /g;
|
|
$value =~ s/^\s+|\s+$//g;
|
|
print $value if length $value;
|
|
}
|
|
' -- "$xml_file"
|
|
}
|
|
|
|
# Print the best title-like string stored in the property list $1.
#
# The plist is converted to XML with plutil into a scratch file under
# TMP_ROOT and scanned for <key>/<string> and <key>/<date> pairs. A fixed
# list of well-known key names is tried first, in order; after that any key
# whose name contains "title" or "name" (case-insensitive, keys in sorted
# order) is accepted. Prints nothing, and returns 0, when the plist cannot
# be converted or holds no suitable value.
plist_extract_title() {
  local plist_file="$1"
  local xml_file="$TMP_ROOT/${RANDOM}-plist.xml"

  if ! plutil -convert xml1 -o "$xml_file" "$plist_file" >/dev/null 2>&1; then
    rm -f -- "$xml_file"
    return 0
  fi

  # -0777 slurps the whole file as one record so pairs can span lines.
  perl -0777 -ne '
    my $xml = $_;
    my %pairs;

    while ($xml =~ m{<key>([^<]+)</key>\s*(?:<string>(.*?)</string>|<date>(.*?)</date>)}sg) {
      my ($key, $string, $date) = ($1, $2, $3);
      my $value = defined $string ? $string : $date;
      next unless defined $value;

      $value =~ s/&lt;/</g;
      $value =~ s/&gt;/>/g;
      $value =~ s/&quot;/"/g;
      # \x27 is an apostrophe; spelled this way so the surrounding shell
      # single-quoted string is not interrupted.
      $value =~ s/&apos;/\x27/g;
      # &amp; goes last so that "&amp;lt;" decodes to "&lt;", not "<".
      $value =~ s/&amp;/&/g;
      $value =~ s/[\r\n\t]+/ /g;
      $value =~ s/^\s+|\s+$//g;

      push @{ $pairs{$key} }, $value if length $value;
    }

    for my $preferred (qw(kMDItemTitle DocumentTitle documentTitle Title title kMDItemDisplayName displayName Name name)) {
      if (exists $pairs{$preferred} && @{ $pairs{$preferred} }) {
        print $pairs{$preferred}[0];
        exit 0;
      }
    }

    for my $key (sort keys %pairs) {
      next unless $key =~ /(title|name)/i;
      if (@{ $pairs{$key} }) {
        print $pairs{$key}[0];
        exit 0;
      }
    }
  ' -- "$xml_file"

  rm -f -- "$xml_file"
}
|
|
|
|
# Print the value stored under key $2 in the property list $1.
#
# The plist is converted to XML with plutil into a scratch file under
# TMP_ROOT. <string> and <date> values are printed as text with XML entities
# decoded; booleans are printed as the word "true" or "false", whether
# written as <true/> or <true></true>. Prints nothing, and returns 0, when
# the plist cannot be converted or the key is absent.
plist_extract_value() {
  local plist_file="$1"
  local key_name="$2"
  local xml_file="$TMP_ROOT/${RANDOM}-plist-value.xml"

  if ! plutil -convert xml1 -o "$xml_file" "$plist_file" >/dev/null 2>&1; then
    rm -f -- "$xml_file"
    return 0
  fi

  # The key travels through the environment so it never has to be spliced
  # into the Perl source; \Q...\E then matches it literally.
  TARGET_PLIST_KEY="$key_name" perl -0777 -ne '
    my $target_key = $ENV{TARGET_PLIST_KEY};
    my $xml = $_;

    if ($xml =~ m{<key>\Q$target_key\E</key>\s*<(string|date|true|false)>(.*?)</\1>|<key>\Q$target_key\E</key>\s*<(true|false)\s*/>}s) {
      my $tag = defined $1 ? $1 : $3;
      my $value = defined $2 ? $2 : $tag;
      $value = $tag if $tag eq q{true} || $tag eq q{false};

      $value =~ s/&lt;/</g;
      $value =~ s/&gt;/>/g;
      $value =~ s/&quot;/"/g;
      # \x27 is an apostrophe; spelled this way so the surrounding shell
      # single-quoted string is not interrupted.
      $value =~ s/&apos;/\x27/g;
      # &amp; goes last so that "&amp;lt;" decodes to "&lt;", not "<".
      $value =~ s/&amp;/&/g;
      $value =~ s/[\r\n\t]+/ /g;
      $value =~ s/^\s+|\s+$//g;

      print $value;
    }
  ' -- "$xml_file"

  rm -f -- "$xml_file"
}
|
|
|
|
# Extract the single entry $2 from ZIP archive $1 into a scratch file under
# TMP_ROOT. On success print the scratch file's path and return 0; on
# failure remove the scratch file, print nothing and return 1.
extract_zip_entry_to_temp() {
  local archive="$1"
  local entry_name="$2"
  local destination="$TMP_ROOT/${RANDOM}-${entry_name:t}"

  if ! unzip -p "$archive" "$entry_name" > "$destination" 2>/dev/null; then
    rm -f -- "$destination"
    return 1
  fi

  print -r -- "$destination"
  return 0
}
|
|
|
|
# Print the entry names of ZIP archive $1, one per line (unzip -Z1), with
# unzip's own diagnostics discarded. The exit status is unzip's.
zip_listing() {
  local archive=$1
  unzip -Z1 "$archive" 2>/dev/null
}
|
|
|
|
# Succeed when some line of the listing text $1 matches the extended
# regular expression $2.
#
# The listing is handed to grep as a here-string, not through a pipe.
# `grep -q` exits on the first match; with a large listing the writing side
# of a pipe is then killed by SIGPIPE, and under `pipefail` that turned a
# successful match into a failure.
zip_has_entry() {
  local listing="$1"
  local pattern="$2"

  grep -E -q -- "$pattern" <<< "$listing"
}
|
|
|
|
# Print the printable strings found anywhere in file $1 (strings -a), with
# error output discarded.
zip_string_markers() {
  local target=$1
  strings -a "$target" 2>/dev/null
}
|
|
|
|
# Succeed when some line of the text $1 matches the extended regular
# expression $2.
#
# The text is handed to grep as a here-string, not through a pipe.
# `grep -q` exits on the first match; with a large input the writing side
# of a pipe is then killed by SIGPIPE, and under `pipefail` that turned a
# successful match into a failure.
text_has_marker() {
  local text="$1"
  local pattern="$2"

  grep -E -q -- "$pattern" <<< "$text"
}
|
|
|
|
# Succeed when the raw bytes of file $1 contain a match for the extended
# regular expression $2. `-a` makes grep treat binary data as text, and
# LC_ALL=C is set for the grep call only.
archive_has_binary_marker() {
  local archive=$1 pattern=$2

  LC_ALL=C grep -a -E -q -- "$pattern" "$archive"
}
|
|
|
|
# Print the document title stored in entry $2 of ZIP archive $1.
#
# The entry is extracted to a scratch file and parsed according to its
# extension: *.plist through plist_extract_title, *.xml through
# xml_extract_title. Any other extension yields an empty title. Prints
# nothing at all, and returns 0, when the entry cannot be extracted.
extract_zip_title() {
  local archive="$1"
  local entry_name="$2"
  local scratch_copy=""
  local title=""

  if ! scratch_copy=$(extract_zip_entry_to_temp "$archive" "$entry_name"); then
    return 0
  fi

  if [[ "$entry_name" == *.plist ]]; then
    title=$(plist_extract_title "$scratch_copy")
  elif [[ "$entry_name" == *.xml ]]; then
    title=$(xml_extract_title "$scratch_copy")
  fi

  rm -f -- "$scratch_copy"
  print -r -- "$title"
}
|
|
|
|
map_odf_mimetype() {
|
|
case "$1" in
|
|
application/vnd.oasis.opendocument.text)
|
|
print -r -- "OpenDocument-Text|odt"
|
|
;;
|
|
application/vnd.oasis.opendocument.spreadsheet)
|
|
print -r -- "OpenDocument-Sheet|ods"
|
|
;;
|
|
application/vnd.oasis.opendocument.presentation)
|
|
print -r -- "OpenDocument-Presentation|odp"
|
|
;;
|
|
application/epub+zip)
|
|
print -r -- "EPUB|epub"
|
|
;;
|
|
*)
|
|
return 1
|
|
;;
|
|
esac
|
|
}
|
|
|
|
# Classify the ZIP archive $1 and record the verdict with set_detection.
#
# Checks run in this order and the first hit wins:
#   1. unreadable listing  -> Damaged-* groups, chosen by raw byte markers
#   2. legacy Apple Pages package
#   3. modern iWork package (Numbers / Pages / ambiguous)
#   4. Office Open XML (Word, Excel, PowerPoint)
#   5. `mimetype` entry (OpenDocument, EPUB)
#   6. Android APK
#   7. Java archive
#   8. otherwise Unknown
# Always returns 0; the result is delivered through the DETECTED_* globals.
classify_zip() {
  local archive="$1"
  local listing="" title=""
  local mime_file="" mime_value="" mapped=""
  local iwork_properties="" is_multi_page=""
  local odf_group="" odf_suffix=""
  local ooxml_spec=""
  local -a fields

  # 1. The listing cannot be read: fall back to scanning the raw bytes.
  if ! listing=$(zip_listing "$archive"); then
    if archive_has_binary_marker "$archive" 'Metadata/DocumentProperties\.plist|Pages/'; then
      set_detection "Damaged-Pages" "zip" "medium" "damaged ZIP contains Apple Pages package markers"
    elif archive_has_binary_marker "$archive" 'Index/Tables/'; then
      set_detection "Damaged-Numbers" "zip" "medium" "damaged ZIP contains Apple Numbers table markers"
    elif archive_has_binary_marker "$archive" 'Index/Document\.iwa' \
      && archive_has_binary_marker "$archive" 'Index/CalculationEngine'; then
      set_detection "Damaged-Apple-iWork" "zip" "medium" "damaged ZIP contains Apple iWork internal markers"
    else
      set_detection "Damaged-Zip" "zip" "low" "failed to read ZIP central directory"
    fi
    return 0
  fi

  # 2. Legacy Pages package.
  if zip_has_entry "$listing" '^Metadata/DocumentProperties\.plist$' \
    && zip_has_entry "$listing" '^Pages/'; then
    title=$(extract_zip_title "$archive" 'Metadata/DocumentProperties.plist')
    set_detection "Pages" "pages" "high" "Apple Pages package markers found" "$title"
    return 0
  fi

  # 3. Modern iWork package.
  if zip_has_entry "$listing" '^Index/Document\.iwa$' \
    && zip_has_entry "$listing" '^Metadata/Properties\.plist$'; then
    iwork_properties=$(extract_zip_entry_to_temp "$archive" 'Metadata/Properties.plist') || iwork_properties=""
    if [[ -n "$iwork_properties" ]]; then
      is_multi_page=$(plist_extract_value "$iwork_properties" 'isMultiPage')
      title=$(plist_extract_title "$iwork_properties")
      rm -f -- "$iwork_properties"
    fi

    if zip_has_entry "$listing" '^Index/Tables/'; then
      set_detection "Numbers" "numbers" "high" "modern iWork package contains Numbers table markers" "$title"
    elif [[ "$is_multi_page" == true ]]; then
      set_detection "Pages" "pages" "medium" "modern iWork package with multipage marker" "$title"
    else
      set_detection "Apple-iWork" "zip" "medium" "modern iWork package detected but subtype is ambiguous" "$title"
    fi
    return 0
  fi

  # 4. Office Open XML. Each spec is "top-level dir|group|suffix|markup name".
  if zip_has_entry "$listing" '^\[Content_Types\]\.xml$'; then
    for ooxml_spec in \
      'word|Word|docx|WordprocessingML' \
      'xl|Excel|xlsx|SpreadsheetML' \
      'ppt|PowerPoint|pptx|PresentationML'; do
      fields=("${(@s:|:)ooxml_spec}")
      if zip_has_entry "$listing" "^${fields[1]}/"; then
        title=$(extract_zip_title "$archive" 'docProps/core.xml')
        set_detection "${fields[2]}" "${fields[3]}" "high" "${fields[4]} markers found" "$title"
        return 0
      fi
    done
  fi

  # 5. A `mimetype` entry. Only the first 255 bytes of it are considered.
  if zip_has_entry "$listing" '^mimetype$'; then
    mime_file=$(extract_zip_entry_to_temp "$archive" 'mimetype') || mime_file=""
    if [[ -n "$mime_file" ]]; then
      mime_value=$(trim_value "$(head -c 255 -- "$mime_file" 2>/dev/null)")
      rm -f -- "$mime_file"

      mapped=$(map_odf_mimetype "$mime_value") || mapped=""
      if [[ -n "$mapped" ]]; then
        odf_group=${mapped%%|*}
        odf_suffix=${mapped##*|}
        # Only the OpenDocument suffixes get a title lookup in meta.xml.
        case "$odf_suffix" in
          odt|ods|odp)
            title=$(extract_zip_title "$archive" 'meta.xml')
            ;;
        esac
        set_detection "$odf_group" "$odf_suffix" "high" "mimetype entry identified package type" "$title"
        return 0
      fi
    fi
  fi

  # 6. Android package.
  if zip_has_entry "$listing" '^AndroidManifest\.xml$' \
    && zip_has_entry "$listing" '^classes\.dex$'; then
    set_detection "APK" "apk" "high" "Android APK markers found"
    return 0
  fi

  # 7. Java archive.
  if zip_has_entry "$listing" '^META-INF/MANIFEST\.MF$'; then
    set_detection "Jar" "jar" "medium" "Java archive manifest found"
    return 0
  fi

  # 8. Nothing recognised.
  set_detection "Unknown" "zip" "low" "ZIP archive lacks a strong application signature"
}
|
|
|
|
gzip_original_name() {
|
|
perl -e '
|
|
use strict;
|
|
use warnings;
|
|
|
|
my $file = shift @ARGV;
|
|
open my $fh, q{<:raw}, $file or exit 0;
|
|
read($fh, my $header, 10) == 10 or exit 0;
|
|
my @bytes = unpack(q{C10}, $header);
|
|
exit 0 unless $bytes[0] == 0x1f && $bytes[1] == 0x8b;
|
|
|
|
my $flags = $bytes[3];
|
|
|
|
if ($flags & 0x04) {
|
|
read($fh, my $xlen_raw, 2) == 2 or exit 0;
|
|
my $xlen = unpack(q{v}, $xlen_raw);
|
|
read($fh, my $discard, $xlen) == $xlen or exit 0;
|
|
}
|
|
|
|
if ($flags & 0x08) {
|
|
my $name = q{};
|
|
while (read($fh, my $char, 1) == 1) {
|
|
last if $char eq "\0";
|
|
$name .= $char;
|
|
}
|
|
print $name if length $name;
|
|
}
|
|
' -- "$1"
|
|
}
|
|
|
|
derive_basename_from_hint() {
|
|
local hint="$1"
|
|
local suffix="$2"
|
|
local base="${hint:t}"
|
|
local inner_suffix="$suffix"
|
|
|
|
base=${base%.gz}
|
|
if [[ "$inner_suffix" == *.gz ]]; then
|
|
inner_suffix=${inner_suffix%.gz}
|
|
fi
|
|
if [[ -n "$inner_suffix" ]]; then
|
|
base=${base%.${inner_suffix}}
|
|
else
|
|
base=${base%.*}
|
|
fi
|
|
|
|
print -r -- "$base"
|
|
}
|
|
|
|
# Classify a decompressed gzip payload from what file(1) reported about it
# and record the verdict with set_detection.
#   $1 payload_file  path of the decompressed payload
#   $2 description   output of `file -b`
#   $3 mime_type     output of `file -b --mime-type`
# Always returns 0; the result is delivered through the DETECTED_* globals.
#
# Compared with a plain MIME lookup this also covers:
#   * text/rtf, which file(1) commonly reports for RTF in place of
#     application/rtf;
#   * ZIP-based containers that file(1) reports under their own MIME type
#     (OOXML, OpenDocument, EPUB, JAR, APK). They go through classify_zip
#     exactly like application/zip;
#   * the legacy Office MIME types, checked alongside the description text.
classify_payload_by_file_info() {
  local payload_file="$1"
  local description="$2"
  local mime_type="$3"

  case "$mime_type" in
    application/pdf)
      set_detection "PDF" "pdf.gz" "high" "gzip payload detected as PDF"
      return 0
      ;;
    text/plain)
      set_detection "Text" "txt.gz" "medium" "gzip payload detected as plain text"
      return 0
      ;;
    text/html)
      set_detection "HTML" "html.gz" "medium" "gzip payload detected as HTML"
      return 0
      ;;
    application/xml|text/xml)
      set_detection "XML" "xml.gz" "medium" "gzip payload detected as XML"
      return 0
      ;;
    application/json|text/json)
      set_detection "JSON" "json.gz" "medium" "gzip payload detected as JSON"
      return 0
      ;;
    application/rtf|text/rtf)
      set_detection "RichText" "rtf.gz" "medium" "gzip payload detected as RTF"
      return 0
      ;;
    image/png)
      set_detection "Image" "png.gz" "high" "gzip payload detected as PNG"
      return 0
      ;;
    image/jpeg)
      set_detection "Image" "jpg.gz" "high" "gzip payload detected as JPEG"
      return 0
      ;;
    image/tiff)
      set_detection "Image" "tiff.gz" "high" "gzip payload detected as TIFF"
      return 0
      ;;
    image/gif)
      set_detection "Image" "gif.gz" "high" "gzip payload detected as GIF"
      return 0
      ;;
    application/zip|application/epub+zip|application/java-archive|application/vnd.android.package-archive|application/vnd.openxmlformats-officedocument.*|application/vnd.oasis.opendocument.*)
      # A ZIP container: let the ZIP classifier look inside, then mark the
      # result as gzip-wrapped.
      classify_zip "$payload_file"
      if [[ "$DETECTED_GROUP" != "Unknown" ]]; then
        DETECTED_SUFFIX="${DETECTED_SUFFIX}.gz"
        DETECTED_REASON="gzip payload wraps a recognized ${DETECTED_GROUP} package"
      else
        set_detection "Unknown" "gz" "low" "gzip payload is ZIP data without a strong application signature"
      fi
      return 0
      ;;
  esac

  # Legacy (pre-OOXML) Office documents: MIME type or description text.
  if [[ "$mime_type" == application/msword || "$description" == *"Microsoft Word"* ]]; then
    set_detection "Word" "doc.gz" "medium" "gzip payload looks like a legacy Word document"
    return 0
  fi

  if [[ "$mime_type" == application/vnd.ms-excel || "$description" == *"Microsoft Excel"* ]]; then
    set_detection "Excel" "xls.gz" "medium" "gzip payload looks like a legacy Excel document"
    return 0
  fi

  if [[ "$mime_type" == application/vnd.ms-powerpoint || "$description" == *"Microsoft PowerPoint"* ]]; then
    set_detection "PowerPoint" "ppt.gz" "medium" "gzip payload looks like a legacy PowerPoint document"
    return 0
  fi

  set_detection "Unknown" "gz" "low" "gzip payload type is not recognized"
}
|
|
|
|
# Classify the gzip file $1 and record the verdict with set_detection.
#
# A file that tar can list is a TAR archive. Anything else is decompressed
# into a scratch file under TMP_ROOT and classified from file(1)'s MIME type
# and description. When the gzip header carries an original file name and
# the type was recognised, that name supplies DETECTED_BASENAME.
# Always returns 0.
#
# The scratch payload is removed on every path, including a failed
# decompression, which used to leave the partial file behind until exit.
classify_gz() {
  local archive="$1"
  local header_name=""
  local payload_file="$TMP_ROOT/${RANDOM}-payload"
  local mime_type=""
  local description=""

  header_name=$(gzip_original_name "$archive")

  if tar -tzf "$archive" >/dev/null 2>&1; then
    set_detection "Tar" "tar.gz" "high" "gzip payload is a TAR archive"
    if [[ -n "$header_name" ]]; then
      DETECTED_BASENAME=$(derive_basename_from_hint "$header_name" "tar.gz")
    fi
    return 0
  fi

  if ! gzip -cd -- "$archive" > "$payload_file" 2>/dev/null; then
    # The redirection has already created the file, possibly with a
    # partial payload in it.
    rm -f -- "$payload_file"
    set_detection "Unknown" "gz" "low" "failed to decompress gzip payload"
    return 0
  fi

  mime_type=$(file -b --mime-type "$payload_file" 2>/dev/null)
  description=$(file -b "$payload_file" 2>/dev/null)
  classify_payload_by_file_info "$payload_file" "$description" "$mime_type"

  rm -f -- "$payload_file"

  if [[ -n "$header_name" && "$DETECTED_GROUP" != "Unknown" ]]; then
    DETECTED_BASENAME=$(derive_basename_from_hint "$header_name" "$DETECTED_SUFFIX")
  fi

  return 0
}
|
|
|
|
is_managed_output_path() {
|
|
local path="$1"
|
|
local relative="${path#$SCAN_ROOT/}"
|
|
local managed_dir
|
|
|
|
if [[ "$relative" == Salvaged/* || "$relative" == *.salvaged/* ]]; then
|
|
return 0
|
|
fi
|
|
|
|
for managed_dir in $MANAGED_DIRS; do
|
|
if [[ "$relative" == ${managed_dir}/* ]]; then
|
|
return 0
|
|
fi
|
|
done
|
|
|
|
return 1
|
|
}
|
|
|
|
resolve_destination() {
|
|
local destination_dir="$1"
|
|
local basename="$2"
|
|
local suffix="$3"
|
|
local candidate="$destination_dir/$basename.$suffix"
|
|
local counter=2
|
|
|
|
if (( OVERWRITE )); then
|
|
print -r -- "$candidate"
|
|
return 0
|
|
fi
|
|
|
|
while [[ -e "$candidate" ]]; do
|
|
candidate="$destination_dir/$basename-$counter.$suffix"
|
|
(( counter++ ))
|
|
done
|
|
|
|
print -r -- "$candidate"
|
|
}
|
|
|
|
perform_move() {
|
|
local source_path="$1"
|
|
local destination_path="$2"
|
|
|
|
if (( DRY_RUN )); then
|
|
log "DRY-RUN $source_path -> $destination_path"
|
|
return 0
|
|
fi
|
|
|
|
mkdir -p -- "${destination_path:h}" || return 1
|
|
if (( OVERWRITE )); then
|
|
mv -f -- "$source_path" "$destination_path"
|
|
else
|
|
mv -- "$source_path" "$destination_path"
|
|
fi
|
|
}
|
|
|
|
perform_copy() {
|
|
local source_path="$1"
|
|
local destination_path="$2"
|
|
|
|
if (( DRY_RUN )); then
|
|
log "DRY-RUN copy $source_path -> $destination_path"
|
|
return 0
|
|
fi
|
|
|
|
mkdir -p -- "${destination_path:h}" || return 1
|
|
if (( OVERWRITE )); then
|
|
cp -f "$source_path" "$destination_path"
|
|
else
|
|
cp "$source_path" "$destination_path"
|
|
fi
|
|
}
|
|
|
|
# For an iWork package whose application could not be determined, place one
# copy of $1 in directory $2 for each suffix in IWORK_AMBIGUOUS_SUFFIXES,
# named after basename $3. Each failed copy bumps FAILED_COUNT and emits a
# warning; the remaining suffixes are still attempted.
create_ambiguous_iwork_copies() {
  local source_path="$1"
  local destination_dir="$2"
  local final_basename="$3"
  local suffix=""
  local copy_path=""

  for suffix in $IWORK_AMBIGUOUS_SUFFIXES; do
    copy_path=$(resolve_destination "$destination_dir" "$final_basename" "$suffix")

    if ! perform_copy "$source_path" "$copy_path"; then
      (( FAILED_COUNT++ ))
      warn "failed to copy $source_path to $copy_path"
      continue
    fi

    verbose "prepared iWork probe copy: $copy_path"
    log "$source_path -> $copy_path [Apple-iWork-probe, low]"
  done
}
|
|
|
|
# Classify one archive ($1) and move it into the folder for its detected
# type under SCAN_ROOT, updating the run counters as it goes.
#
# Files already inside a managed output location are counted as skipped.
# The new name is DETECTED_BASENAME when the classifier supplied one,
# otherwise a stem derived from the source file name. Ambiguous iWork
# packages additionally get one probe copy per candidate suffix.
#
# For .gz sources the stem is derived with derive_basename_from_hint, so a
# file such as "backup.tar.gz" classified as "tar.gz" becomes
# "backup.tar.gz" and not "backup.tar.tar.gz": stripping only the final
# extension left the inner one in place, and the detected suffix then added
# it a second time.
process_archive() {
  local archive="$1"
  local source_name="${archive:t}"
  local source_extension="${source_name:e:l}"
  local source_basename="${source_name:r}"
  local final_basename=""
  local destination_dir=""
  local destination_path=""

  if is_managed_output_path "$archive"; then
    (( SKIPPED_COUNT++ ))
    verbose "skipping managed output path: $archive"
    return 0
  fi

  (( PROCESSED_COUNT++ ))
  reset_detection

  case "$source_extension" in
    zip)
      classify_zip "$archive"
      ;;
    gz)
      classify_gz "$archive"
      ;;
    *)
      set_detection "Unknown" "$source_extension" "low" "unsupported file extension"
      ;;
  esac

  if [[ "$DETECTED_GROUP" == "Unknown" ]]; then
    (( UNKNOWN_COUNT++ ))
    final_basename="$source_basename"
    DETECTED_SUFFIX=${DETECTED_SUFFIX:-$source_extension}
  else
    (( CLASSIFIED_COUNT++ ))
    if [[ -n "$DETECTED_BASENAME" ]]; then
      final_basename="$DETECTED_BASENAME"
    elif [[ "$source_extension" == gz ]]; then
      # Drop an inner extension that the detected suffix will re-add.
      final_basename=$(derive_basename_from_hint "$source_basename" "$DETECTED_SUFFIX")
    else
      final_basename="$source_basename"
    fi
  fi

  final_basename=$(sanitize_name "$final_basename")
  destination_dir="$SCAN_ROOT/$DETECTED_GROUP"
  destination_path=$(resolve_destination "$destination_dir" "$final_basename" "$DETECTED_SUFFIX")

  verbose "[$DETECTED_CONFIDENCE] $archive => $DETECTED_GROUP ($DETECTED_REASON)"

  if perform_move "$archive" "$destination_path"; then
    (( RENAMED_COUNT++ ))
    log "$archive -> $destination_path [$DETECTED_GROUP, $DETECTED_CONFIDENCE]"
    if [[ "$DETECTED_GROUP" == "Apple-iWork" ]]; then
      create_ambiguous_iwork_copies "$destination_path" "$destination_dir" "$final_basename"
    fi
  else
    (( FAILED_COUNT++ ))
    warn "failed to move $archive"
  fi
}
|
|
|
|
collect_archives() {
|
|
find "$SCAN_ROOT" \
|
|
\( -path "$SCAN_ROOT/Salvaged" -o -path "$SCAN_ROOT/Salvaged/*" -o -path "$SCAN_ROOT/*.salvaged" -o -path "$SCAN_ROOT/*.salvaged/*" \) -prune \
|
|
-o -type f \( -iname '*.zip' -o -iname '*.gz' \) -print
|
|
}
|
|
|
|
collect_salvage_targets() {
|
|
local damaged_dir
|
|
|
|
for damaged_dir in Damaged-Zip Damaged-Apple-iWork Damaged-Pages Damaged-Numbers; do
|
|
if [[ -d "$SCAN_ROOT/$damaged_dir" ]] && find "$SCAN_ROOT/$damaged_dir" -type f -iname '*.zip' -print -quit | grep -q .; then
|
|
print -r -- "$SCAN_ROOT/$damaged_dir"
|
|
fi
|
|
done
|
|
}
|
|
|
|
# Run the companion salvage script on one damaged-output folder.
#   $1 folder to salvage   $2 folder the salvaged output should go to
# --dry-run and --verbose are forwarded when set for this run. Aborts via
# die when the salvage script is missing or not executable; a failing
# salvage run only produces a warning.
run_salvage_workflow() {
  local salvage_target="$1"
  local salvage_output_root="$2"
  local -a salvage_cmd

  if [[ ! -x "$SALVAGE_SCRIPT_PATH" ]]; then
    die "salvage script not found or not executable: $SALVAGE_SCRIPT_PATH"
  fi

  salvage_cmd=("$SALVAGE_SCRIPT_PATH")
  (( DRY_RUN )) && salvage_cmd+=(--dry-run)
  (( VERBOSE )) && salvage_cmd+=(--verbose)
  salvage_cmd+=(--output "$salvage_output_root" "$salvage_target")

  log "Salvage $salvage_target -> $salvage_output_root"

  if ! "${salvage_cmd[@]}"; then
    warn "salvage workflow failed for $salvage_target"
  fi
}
|
|
|
|
# Parse the command line into the DRY_RUN, VERBOSE, OVERWRITE,
# SALVAGE_DAMAGED and SCAN_ROOT globals.
#
# Exactly one directory operand is accepted, either among the options or
# after a `--` separator. -h/--help prints the usage text and exits 0. A
# missing directory prints the usage text and exits 1. Unknown options, a
# second directory and a non-existent directory abort via die. On success
# SCAN_ROOT holds the absolute, symlink-resolved path.
#
# Operands left over after `--` are checked too. They used to be dropped
# silently, so `-- dirA dirB` scanned dirA and ignored dirB without a word.
parse_args() {
  local arg

  while (( $# )); do
    arg=$1
    shift

    case "$arg" in
      -n|--dry-run)
        DRY_RUN=1
        ;;
      -v|--verbose)
        VERBOSE=1
        ;;
      --overwrite)
        OVERWRITE=1
        ;;
      --salvage-damaged)
        SALVAGE_DAMAGED=1
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      --)
        # Everything that follows is an operand, even if it starts with "-".
        break
        ;;
      -*)
        die "unknown option: $arg"
        ;;
      *)
        if [[ -n "$SCAN_ROOT" ]]; then
          die "only one directory may be provided"
        fi
        SCAN_ROOT=$arg
        ;;
    esac
  done

  # Operands that followed `--`.
  if (( $# > 0 )); then
    if [[ -n "$SCAN_ROOT" ]] || (( $# > 1 )); then
      die "only one directory may be provided"
    fi
    SCAN_ROOT=$1
    shift
  fi

  [[ -n "$SCAN_ROOT" ]] || {
    usage
    exit 1
  }

  [[ -d "$SCAN_ROOT" ]] || die "directory does not exist: $SCAN_ROOT"
  SCAN_ROOT=${SCAN_ROOT:A}
}
|
|
|
|
# Program entry point: parse arguments, check tools, classify every
# candidate archive under SCAN_ROOT, optionally run the salvage workflow on
# the damaged-output folders, and print a summary of the counters.
main() {
  local archive salvage_target salvage_output_root summary_line
  local -a archives salvage_targets
  local -i salvage_pending=0

  parse_args "$@"
  ensure_tools
  make_temp_root

  (( DRY_RUN )) && ACTION_LABEL="Planned"

  archives=(${(f)"$(collect_archives)"})
  salvage_targets=(${(f)"$(collect_salvage_targets)"})

  # Salvage work exists only if it was requested and a target is present.
  if (( SALVAGE_DAMAGED && ${#salvage_targets} > 0 )); then
    salvage_pending=1
  fi

  if (( ${#archives} == 0 && ! salvage_pending )); then
    log "No .zip or .gz files found under $SCAN_ROOT"
    return 0
  fi

  if (( ${#archives} > 0 )); then
    verbose "found ${#archives} candidate archives under $SCAN_ROOT"
    for archive in $archives; do
      process_archive "$archive"
    done
  fi

  if (( SALVAGE_DAMAGED )); then
    # Re-scan: classification may just have filled the Damaged-* folders.
    salvage_targets=(${(f)"$(collect_salvage_targets)"})
    (( ${#salvage_targets} )) || verbose "no damaged ZIP output folders found for salvage"

    for salvage_target in $salvage_targets; do
      salvage_output_root="$SCAN_ROOT/Salvaged/${salvage_target:t}"
      run_salvage_workflow "$salvage_target" "$salvage_output_root"
    done
  fi

  for summary_line in \
    "" \
    "Summary" \
    " Processed: $PROCESSED_COUNT" \
    " Classified: $CLASSIFIED_COUNT" \
    " Unknown: $UNKNOWN_COUNT" \
    " ${ACTION_LABEL}: $RENAMED_COUNT" \
    " Skipped: $SKIPPED_COUNT" \
    " Failed: $FAILED_COUNT"; do
    log "$summary_line"
  done
}
|
|
|
|
# Run the program with the script's own arguments. The stray trailing `|`
# is dropped: a pipe with nothing after it is a parse error.
main "$@"