127 lines
4.1 KiB
PHP
127 lines
4.1 KiB
PHP
<?php
|
|
|
|
namespace UbsCsvTransformer;
|
|
|
|
/**
 * Extracts metadata from header lines using regex rules.
 *
 * The header lines are the lines that precede the actual CSV table. Each rule
 * is an array with the following keys:
 *
 *  - name         (required) key under which the extracted value is returned
 *  - regex        (required) pattern body WITHOUT delimiters; it is wrapped in
 *                 '#' delimiters and run with the 'u' modifier
 *  - lineNumber   (optional, default 1) 1-based number of the line to inspect
 *  - captureGroup (optional, default 1) group to return; 0 = complete match
 *
 * Rules that cannot be applied are skipped and reported through DebugLogger;
 * they never abort the extraction of the remaining rules.
 */
class MetadataExtractor
{
    private array $rules;

    /**
     * @param array $rules List of rule arrays (see the class description)
     */
    public function __construct(array $rules = [])
    {
        $this->rules = $rules;
    }

    /**
     * Extracts metadata from the provided lines.
     *
     * Only the first match of a rule's regex on its line is used. When the
     * requested capture group does not exist in the pattern, the complete
     * match is returned instead.
     *
     * @param array $lines Array of lines from the CSV header (0-based list)
     * @return array Extracted metadata, keyed by rule name; rules that were
     *               skipped or did not match contribute no key
     */
    public function extract(array $lines): array
    {
        $metadata = [];

        foreach ($this->rules as $rule) {
            // Validate required fields; incomplete rules are skipped silently.
            if (empty($rule['name']) || empty($rule['regex'])) {
                continue;
            }

            $ruleName = $rule['name'];
            $regex = $rule['regex'];
            $lineNumber = $rule['lineNumber'] ?? 1;

            // A non-numeric lineNumber would make the subtraction below throw
            // on PHP 8 and abort every remaining rule, so report it and move on.
            if (!is_numeric($lineNumber)) {
                DebugLogger::log('metadata_warning', "Invalid lineNumber", [
                    'rule_name' => $ruleName,
                    'lineNumber' => $lineNumber,
                ]);
                continue;
            }

            // config.json uses 1-based, human-readable line numbers while the
            // $lines array is 0-based: arrayIndex = lineNumber - 1.
            $arrayIndex = (int) $lineNumber - 1;

            // Check if line exists
            if (!isset($lines[$arrayIndex])) {
                // Line does not exist - debug info for support
                DebugLogger::log('metadata_warning', "Extraction rule not found", [
                    'rule_name' => $ruleName,
                    'expected_lineNumber' => $lineNumber,
                    'array_index' => $arrayIndex,
                    'available_lines' => count($lines),
                ]);
                continue;
            }

            $line = $lines[$arrayIndex];

            // The pattern comes from configuration, so a broken one must not
            // emit a PHP warning; '@' keeps it quiet and the false return
            // value is handled explicitly below.
            $pattern = self::delimitPattern((string) $regex);
            $matchResult = @preg_match_all($pattern, $line, $matches);

            if ($matchResult === false) {
                DebugLogger::log('metadata_error', "Invalid regex pattern", [
                    'rule_name' => $ruleName,
                    'pattern' => $regex,
                    // Lets support tell a pattern problem from a subject
                    // problem such as PREG_BAD_UTF8_ERROR (line is not UTF-8).
                    'preg_error' => preg_last_error(),
                ]);
                continue;
            }

            if ($matchResult === 0) {
                // Regex did not match on this line
                DebugLogger::log('metadata_warning', "Regex did not match", [
                    'rule_name' => $ruleName,
                    'lineNumber' => $lineNumber,
                    'regex_pattern' => $regex,
                    'line_content' => substr($line, 0, 100),
                ]);
                continue;
            }

            // captureGroup selects what is extracted:
            //   0 = complete match, 1 = first group (...), 2 = second group, ...
            $captureGroup = isset($rule['captureGroup']) ? (int) $rule['captureGroup'] : 1;

            // $matches[group][0] is the group's value within the first match.
            // Fall back to the complete match when the group does not exist.
            $metadata[$ruleName] = $matches[$captureGroup][0] ?? $matches[0][0] ?? '';

            DebugLogger::log('metadata', "Extraction rule applied", [
                'rule_name' => $ruleName,
                'value' => $metadata[$ruleName],
            ]);
        }

        return $metadata;
    }

    /**
     * Returns the number of defined extraction rules.
     *
     * @return int Number of rules
     */
    public function getRuleCount(): int
    {
        return count($this->rules);
    }

    /**
     * Returns all defined extraction rules.
     *
     * @return array The rules, exactly as passed to the constructor
     */
    public function getRules(): array
    {
        return $this->rules;
    }

    /**
     * Wraps a delimiter-less regex in '#' delimiters and adds the 'u' modifier.
     *
     * '#' is used as delimiter so that '/' can appear unescaped in user
     * patterns. Only '#' characters that are not already backslash-escaped are
     * escaped: the scan consumes "backslash + next character" pairs as a unit,
     * so an existing "\#" is left alone instead of turning into "\\#" (an
     * escaped backslash followed by a bare delimiter, which would end the
     * pattern early).
     *
     * @param string $regex Pattern body without delimiters
     * @return string Complete PCRE pattern
     */
    private static function delimitPattern(string $regex): string
    {
        $escaped = preg_replace_callback(
            '/\\\\.|#/s',
            static function (array $token): string {
                return $token[0] === '#' ? '\#' : $token[0];
            },
            $regex
        );

        // preg_replace_callback() returns null on an engine error; in that
        // case use the raw pattern and let the caller's match report failure.
        return '#' . ($escaped ?? $regex) . '#u';
    }
}
|