firefly-import-preprocessor/src/MetadataExtractor.php
2026-05-04 00:23:02 +02:00

127 lines
4.1 KiB
PHP

<?php
namespace UbsCsvTransformer;
/**
* Extracts metadata from header lines using regex
*
* Extracts constant values from metadata lines
* (header lines before the actual CSV table) using regex rules.
*/
class MetadataExtractor
{
    /**
     * Extraction rules, one array per rule. Keys read by extract():
     *  - name         (required) key under which the value is returned
     *  - regex        (required) PCRE pattern body, without delimiters
     *  - lineNumber   (optional, default 1) 1-based header line to inspect
     *  - captureGroup (optional, default 1) 0 = whole match, n = n-th group
     *
     * @var array<int|string, array<string, mixed>>
     */
    private array $rules;

    /**
     * @param array<int|string, array<string, mixed>> $rules Extraction rules (see the $rules property)
     */
    public function __construct(array $rules = [])
    {
        $this->rules = $rules;
    }

    /**
     * Applies every rule to its configured header line and collects the results.
     *
     * A rule is skipped (and, except for missing name/regex, reported through
     * DebugLogger) when its name or regex is missing, its line is not
     * available, its pattern cannot be evaluated, or the pattern does not
     * match. Only the first match on the line is used.
     *
     * @param array $lines Header lines of the CSV file, 0-indexed
     * @return array<int|string, string> Extracted values keyed by rule name
     */
    public function extract(array $lines): array
    {
        $metadata = [];

        foreach ($this->rules as $rule) {
            // Both fields are mandatory; incomplete rules are ignored silently.
            if (empty($rule['name']) || empty($rule['regex'])) {
                continue;
            }

            $ruleName = $rule['name'];
            $regex = $rule['regex'];
            $lineNumber = $rule['lineNumber'] ?? 1;

            // Config line numbers are 1-based; the $lines array is 0-based.
            $arrayIndex = $this->toArrayIndex($lineNumber);

            if ($arrayIndex === null || !isset($lines[$arrayIndex])) {
                DebugLogger::log('metadata_warning', "Extraction rule not found", [
                    'rule_name' => $ruleName,
                    'expected_lineNumber' => $lineNumber,
                    'array_index' => $arrayIndex,
                    'available_lines' => count($lines),
                ]);
                continue;
            }

            $line = $lines[$arrayIndex];

            [$matchResult, $matches, $pcreError] = $this->matchFirst(
                $this->buildPattern((string) $regex),
                $line
            );

            if ($matchResult === false) {
                // Either the pattern failed to compile or PCRE hit a runtime
                // error (e.g. the line is not valid UTF-8 for the 'u' modifier).
                DebugLogger::log('metadata_error', "Invalid regex pattern", [
                    'rule_name' => $ruleName,
                    'pattern' => $regex,
                    'error' => $pcreError,
                ]);
                continue;
            }

            if ($matchResult === 0) {
                DebugLogger::log('metadata_warning', "Regex did not match", [
                    'rule_name' => $ruleName,
                    'lineNumber' => $lineNumber,
                    'regex_pattern' => $regex,
                    'line_content' => substr($line, 0, 100),
                ]);
                continue;
            }

            $captureGroup = isset($rule['captureGroup']) ? intval($rule['captureGroup']) : 1;
            $metadata[$ruleName] = $this->selectValue($matches, $captureGroup);

            DebugLogger::log('metadata', "Extraction rule applied", [
                'rule_name' => $ruleName,
                'value' => $metadata[$ruleName] ?? null,
            ]);
        }

        return $metadata;
    }

    /**
     * Returns the number of defined extraction rules
     *
     * @return int Number of rules
     */
    public function getRuleCount(): int
    {
        return count($this->rules);
    }

    /**
     * Returns all defined extraction rules
     *
     * @return array The rules exactly as passed to the constructor
     */
    public function getRules(): array
    {
        return $this->rules;
    }

    /**
     * Converts a configured 1-based line number into a 0-based array index.
     *
     * @param mixed $lineNumber Raw value from the rule configuration
     * @return int|null Array index, or null when the value is not an integer >= 1
     */
    private function toArrayIndex($lineNumber): ?int
    {
        $validated = filter_var(
            $lineNumber,
            FILTER_VALIDATE_INT,
            ['options' => ['min_range' => 1]]
        );

        return $validated === false ? null : $validated - 1;
    }

    /**
     * Wraps a user pattern in '#' delimiters with the 'u' modifier.
     *
     * '#' is used as delimiter so that '/' may appear unescaped in user
     * patterns. Only a '#' that is not already escaped gets a backslash:
     * blindly prefixing every '#' would turn a user-written '\#' into '\\#',
     * i.e. a literal backslash followed by the closing delimiter.
     *
     * @param string $regex Pattern body as configured by the user
     * @return string Complete PCRE pattern including delimiters and modifier
     */
    private function buildPattern(string $regex): string
    {
        // Tokenise into "backslash + any byte" pairs (kept as-is) and bare '#'.
        $escaped = preg_replace_callback(
            '/\\\\.|#/s',
            static fn (array $token): string => $token[0] === '#' ? '\\#' : $token[0],
            $regex
        );

        return '#' . ($escaped ?? $regex) . '#u';
    }

    /**
     * Runs the pattern against the line and returns the first match.
     *
     * PCRE warnings are captured instead of emitted so that a broken user
     * pattern cannot leak into the output, while the reason stays available
     * for logging.
     *
     * @param string $pattern Complete PCRE pattern
     * @param string $line    Subject line
     * @return array{0: int|false, 1: array, 2: string|null}
     *         [preg_match() result, match groups (unmatched groups are null), error description]
     */
    private function matchFirst(string $pattern, string $line): array
    {
        $warning = null;
        set_error_handler(static function (int $severity, string $message) use (&$warning): bool {
            $warning = $message;

            return true;
        });

        try {
            $matches = [];
            $result = preg_match($pattern, $line, $matches, PREG_UNMATCHED_AS_NULL);
        } finally {
            restore_error_handler();
        }

        if ($result === false) {
            return [false, [], $warning ?? sprintf('PCRE error code %d', preg_last_error())];
        }

        return [$result, $matches, null];
    }

    /**
     * Picks the configured capture group out of a successful match.
     *
     * @param array $matches      Groups from preg_match() with PREG_UNMATCHED_AS_NULL
     * @param int   $captureGroup 0 = whole match, n = n-th capture group
     * @return string The group's text; '' when the group exists but did not
     *                participate in the match; the whole match when the
     *                pattern defines no such group
     */
    private function selectValue(array $matches, int $captureGroup): string
    {
        if (!array_key_exists($captureGroup, $matches)) {
            // Pattern has no such group: fall back to the complete match.
            return $matches[0] ?? '';
        }

        return $matches[$captureGroup] ?? '';
    }
}