fix(extractor): detect xrechnung XML attachments
This commit is contained in:
5
.envrc
Normal file
5
.envrc
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Activate the devshell from the Nix flake
|
||||||
|
# This loads all tools and environment variables defined in flake.nix
|
||||||
|
|
||||||
|
use flake
|
||||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -55,3 +55,6 @@ opencode.json
|
|||||||
.sidecar-start.sh
|
.sidecar-start.sh
|
||||||
.sidecar-base
|
.sidecar-base
|
||||||
.td-root
|
.td-root
|
||||||
|
|
||||||
|
# pi
|
||||||
|
.pi*
|
||||||
|
|||||||
1
.opencode-rules
Symbolic link
1
.opencode-rules
Symbolic link
@@ -0,0 +1 @@
|
|||||||
|
/nix/store/5f1zpyn23pabv4j30g651cy2bfkpi7f6-source/rules
|
||||||
@@ -6,7 +6,6 @@ Supports ZUGFeRD 2.x profiles: MINIMUM, BASIC, BASIC WL, EN16931, EXTENDED.
|
|||||||
|
|
||||||
import io
|
import io
|
||||||
import time
|
import time
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from facturx import get_flavor, get_level, get_xml_from_pdf
|
from facturx import get_flavor, get_level, get_xml_from_pdf
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
@@ -402,6 +401,84 @@ def get_pdf_page_count(pdf_bytes: bytes) -> int:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def _find_xml_attachment_fallback(pdf_bytes: bytes) -> tuple[str | None, bytes | None]:
    """Find an embedded XML attachment by MIME type or filename (fallback).

    This handles XRechnung and other XML-based invoices with non-standard
    filenames (e.g., 'xrechnung.xml' instead of 'factur-x.xml') that the
    factur-x library does not recognize.

    Selection priority:

    1. The first attachment (in PDF order) that declares a subtype/mediatype
       and whose subtype or name contains "xml", provided its content is
       well-formed XML.
    2. Otherwise, the first attachment whose name ends with ".xml" and whose
       content is well-formed XML.

    Attachments with empty content are ignored.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Tuple of (filename, xml_bytes) or (None, None) if not found.

    Raises:
        ExtractionError: With error_code "fallback_pdf_read_error" when an
            unexpected (non-pypdf) error occurs while reading the PDF or its
            attachments.
    """
    # Kept as a function-scope import so this block does not depend on a
    # module-level ``logging`` import.
    import logging

    logger = logging.getLogger(__name__)

    # Parser used only for the well-formedness probe. Attachments come from
    # untrusted PDFs, so entity resolution and network access are disabled.
    probe_parser = etree.XMLParser(resolve_entities=False, no_network=True)

    def is_well_formed(content: bytes) -> bool:
        """Return True when *content* parses as XML."""
        try:
            etree.fromstring(content, parser=probe_parser)
        except etree.XMLSyntaxError:
            return False
        return True

    # First well-formed attachment matched by ".xml" extension only; it is
    # returned only if no MIME match is found.
    extension_match: tuple[str | None, bytes] | None = None

    # Attachment enumeration and content decoding happen on access, so the
    # whole scan stays inside the try and pypdf errors cannot escape unhandled.
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        for att in reader.attachment_list:
            content = att.content  # read once; reused for probe and return
            if not content:
                continue

            raw_name = att.name
            name = (raw_name or "").lower()  # tolerate attachments without a name
            subtype = getattr(att, "subtype", None) or getattr(att, "mediatype", None)

            mime_hit = bool(subtype) and (
                "xml" in str(subtype).lower() or "xml" in name
            )
            extension_hit = name.endswith(".xml")
            if not (mime_hit or extension_hit):
                continue

            if not is_well_formed(content):
                if mime_hit:
                    logger.debug(
                        "Attachment '%s' has XML MIME but invalid XML content",
                        raw_name,
                    )
                else:
                    logger.debug(
                        "Attachment '%s' has .xml extension but invalid XML content",
                        raw_name,
                    )
                continue

            if mime_hit:
                # A MIME match always wins, so stop scanning immediately.
                logger.debug(
                    "Found XML attachment via MIME fallback: %s",
                    raw_name,
                )
                return (raw_name, content)

            if extension_match is None:
                extension_match = (raw_name, content)
    except (PdfReadError, PyPdfError) as e:
        # Best effort: fall through and use whatever was found before the error.
        logger.debug(
            "Failed to read PDF for XML attachment fallback search: %s",
            str(e),
        )
    except Exception as e:
        raise ExtractionError(
            error_code="fallback_pdf_read_error",
            message="Unexpected error reading PDF for XML attachment fallback",
            details=str(e),
        ) from e

    if extension_match is not None:
        logger.debug(
            "Found XML attachment via extension fallback: %s",
            extension_match[0],
        )
        return extension_match

    return (None, None)
|
||||||
|
|
||||||
|
|
||||||
def _profile_from_urn(urn: str) -> str:
|
def _profile_from_urn(urn: str) -> str:
|
||||||
"""Extract a short profile name from a Factur-X/ZUGFeRD URN.
|
"""Extract a short profile name from a Factur-X/ZUGFeRD URN.
|
||||||
|
|
||||||
@@ -440,6 +517,9 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
|
|||||||
details=f"Size: {len(pdf_bytes)} bytes",
|
details=f"Size: {len(pdf_bytes)} bytes",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
xml_filename = None
|
||||||
|
xml_bytes = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
xml_filename, xml_bytes = get_xml_from_pdf(pdf_bytes, check_xsd=False)
|
xml_filename, xml_bytes = get_xml_from_pdf(pdf_bytes, check_xsd=False)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -449,16 +529,21 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
|
|||||||
error_code="password_protected_pdf",
|
error_code="password_protected_pdf",
|
||||||
message="PDF is password protected",
|
message="PDF is password protected",
|
||||||
details=str(e),
|
details=str(e),
|
||||||
)
|
) from e
|
||||||
if "pdf" in error_msg or "trailer" in error_msg or "xref" in error_msg:
|
if "pdf" in error_msg or "trailer" in error_msg or "xref" in error_msg:
|
||||||
raise ExtractionError(
|
raise ExtractionError(
|
||||||
error_code="invalid_pdf", message="Invalid PDF file", details=str(e)
|
error_code="invalid_pdf", message="Invalid PDF file", details=str(e)
|
||||||
)
|
) from e
|
||||||
raise ExtractionError(
|
raise ExtractionError(
|
||||||
error_code="corrupt_pdf",
|
error_code="corrupt_pdf",
|
||||||
message="Failed to extract XML from PDF",
|
message="Failed to extract XML from PDF",
|
||||||
details=str(e),
|
details=str(e),
|
||||||
)
|
) from e
|
||||||
|
|
||||||
|
# Fallback: If factur-x library didn't find XML, check attachments by MIME type or .xml extension
|
||||||
|
# This handles XRechnung and other XML attachments with non-standard filenames
|
||||||
|
if not xml_bytes:
|
||||||
|
xml_filename, xml_bytes = _find_xml_attachment_fallback(pdf_bytes)
|
||||||
|
|
||||||
if not xml_bytes:
|
if not xml_bytes:
|
||||||
pdf_text = extract_text_from_pdf(pdf_bytes)
|
pdf_text = extract_text_from_pdf(pdf_bytes)
|
||||||
@@ -481,9 +566,10 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
|
|||||||
try:
|
try:
|
||||||
level = get_level(xml_root, flavor)
|
level = get_level(xml_root, flavor)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
# GuidelineSpecifiedDocumentContextParameter uses ram: namespace
|
||||||
urn_nodes = xml_root.xpath(
|
urn_nodes = xml_root.xpath(
|
||||||
"//rsm:ExchangedDocumentContext/"
|
"//rsm:ExchangedDocumentContext/"
|
||||||
"rsm:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
|
"ram:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
|
||||||
namespaces=NAMESPACES,
|
namespaces=NAMESPACES,
|
||||||
)
|
)
|
||||||
urn = urn_nodes[0] if urn_nodes else ""
|
urn = urn_nodes[0] if urn_nodes else ""
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ then implementation makes them pass (GREEN phase).
|
|||||||
import pytest
|
import pytest
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
|
from pypdf import PdfReader, PdfWriter
|
||||||
|
|
||||||
|
|
||||||
class TestExtractionError:
|
class TestExtractionError:
|
||||||
"""Test ExtractionError exception class."""
|
"""Test ExtractionError exception class."""
|
||||||
@@ -299,3 +301,60 @@ class TestZUGFeRDProfileVariations:
|
|||||||
# Profile should be detected (BASIC, BASICWL, etc.)
|
# Profile should be detected (BASIC, BASICWL, etc.)
|
||||||
assert result.zugferd_profil is not None
|
assert result.zugferd_profil is not None
|
||||||
assert result.xml_data is not None
|
assert result.xml_data is not None
|
||||||
|
|
||||||
|
|
||||||
|
class TestXRechnungExtraction:
    """Extraction from XRechnung PDFs whose XML attachment has a non-standard name."""

    def test_xrechnung_by_xml_extension_fallback(self):
        """An attachment named 'xrechnung.xml' is detected via the fallback.

        A synthetic one-page PDF is built around the XML payload of an
        existing fixture, but attached under a filename that the factur-x
        library does not recognize. The test then checks, in order, that
        factur-x reports no match, that the fallback helper finds the
        attachment, and that the full extraction succeeds through it.
        """
        from io import BytesIO

        from facturx import get_xml_from_pdf

        from src.extractor import _find_xml_attachment_fallback, extract_zugferd

        # Borrow the payload of the fixture's first attachment. The content is
        # read while the fixture file is still open.
        with open("tests/fixtures/validXRechnung.pdf", "rb") as fixture:
            first_attachment = next(iter(PdfReader(fixture).attachment_list), None)
            xml_content = (
                None if first_attachment is None else first_attachment.content
            )

        if xml_content is None:
            pytest.fail("Test fixture has no XML attachment")

        # Build a minimal PDF carrying the XML under a non-standard name.
        writer = PdfWriter()
        writer.add_blank_page(width=72, height=72)
        writer.add_attachment("xrechnung.xml", xml_content)

        buffer = BytesIO()
        writer.write(buffer)
        pdf_bytes = buffer.getvalue()

        # factur-x itself must NOT recognize the attachment.
        facturx_outcome = get_xml_from_pdf(pdf_bytes, check_xsd=False)
        assert facturx_outcome[0] is False, "factur-x should not recognize xrechnung.xml"

        # The fallback helper must find it.
        found_name, found_bytes = _find_xml_attachment_fallback(pdf_bytes)
        assert found_name == "xrechnung.xml", (
            "Fallback should find xrechnung.xml"
        )
        assert found_bytes is not None, "Fallback should return XML content"

        # The full extraction must succeed by way of the fallback.
        result = extract_zugferd(pdf_bytes)

        assert result.is_zugferd is True
        assert result.xml_raw is not None
        assert result.xml_data is not None
        assert result.xml_data.invoice_number is not None
        assert result.extraction_meta.xml_attachment_name == "xrechnung.xml"
|
||||||
|
|||||||
Reference in New Issue
Block a user