fix(extractor): detect xrechnung XML attachments

This commit is contained in:
m3ta-chiron
2026-05-21 14:45:20 +02:00
parent 18a902bcc9
commit d497a4cb5b
5 changed files with 159 additions and 5 deletions

5
.envrc Normal file
View File

@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Activate the devshell from the Nix flake
# This loads all tools and environment variables defined in flake.nix
use flake

3
.gitignore vendored
View File

@@ -55,3 +55,6 @@ opencode.json
.sidecar-start.sh .sidecar-start.sh
.sidecar-base .sidecar-base
.td-root .td-root
# pi
.pi*

1
.opencode-rules Symbolic link
View File

@@ -0,0 +1 @@
/nix/store/5f1zpyn23pabv4j30g651cy2bfkpi7f6-source/rules

View File

@@ -6,7 +6,6 @@ Supports ZUGFeRD 2.x profiles: MINIMUM, BASIC, BASIC WL, EN16931, EXTENDED.
import io import io
import time import time
from typing import Any
from facturx import get_flavor, get_level, get_xml_from_pdf from facturx import get_flavor, get_level, get_xml_from_pdf
from lxml import etree from lxml import etree
@@ -402,6 +401,84 @@ def get_pdf_page_count(pdf_bytes: bytes) -> int:
return 0 return 0
def _find_xml_attachment_fallback(pdf_bytes: bytes) -> tuple[str | None, bytes | None]:
    """Find an XML attachment by MIME type or filename extension (fallback).

    This handles XRechnung and other XML-based invoices with non-standard
    filenames (e.g., 'xrechnung.xml' instead of 'factur-x.xml') that the
    factur-x library does not recognize.

    Candidates are tried in two passes over the embedded files, in document
    order: first attachments that declare a subtype and whose subtype or
    name mentions "xml", then attachments whose name ends with ".xml". The
    first candidate that parses as well-formed XML wins.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Tuple of (filename, xml_bytes) or (None, None) if not found or if
        pypdf reports a read error for the PDF or its attachments.

    Raises:
        ExtractionError: With error_code "fallback_pdf_read_error" if an
            unexpected (non-pypdf) error occurs while opening the PDF or
            reading its attachments.
    """
    import logging

    logger = logging.getLogger(__name__)

    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        # Enumerating attachments and decoding their streams can fail just
        # like opening the PDF can, so it is done inside the same try block.
        # Each attachment is decoded once and reused by both passes.
        attachments = []
        for att in reader.attachment_list:
            content = att.content
            if not content:
                continue
            subtype = getattr(att, "subtype", None) or getattr(att, "mediatype", None)
            attachments.append((att.name, subtype, content))
    except (PdfReadError, PyPdfError) as e:
        logger.debug(
            "Failed to read PDF for XML attachment fallback search: %s",
            str(e),
        )
        return (None, None)
    except Exception as e:
        raise ExtractionError(
            error_code="fallback_pdf_read_error",
            message="Unexpected error reading PDF for XML attachment fallback",
            details=str(e),
        ) from e

    # Attachments come from untrusted PDFs: this parse is only a
    # well-formedness check, so never expand entities or touch the network.
    parser = etree.XMLParser(resolve_entities=False, no_network=True)

    def _first_well_formed(matches, found_msg, invalid_msg):
        """Return the first matching attachment that parses as XML, else None."""
        for name, subtype, content in attachments:
            if not matches(name, subtype):
                continue
            try:
                etree.fromstring(content, parser)
            except etree.XMLSyntaxError:
                logger.debug(invalid_msg, name)
                continue
            logger.debug(found_msg, name)
            return (name, content)
        return None

    # First pass: check MIME type via Subtype in Filespec or by content type
    # Common XML MIME types: application/xml, text/xml
    hit = _first_well_formed(
        lambda name, subtype: bool(subtype)
        and ("xml" in str(subtype).lower() or "xml" in name.lower()),
        "Found XML attachment via MIME fallback: %s",
        "Attachment '%s' has XML MIME but invalid XML content",
    )
    if hit is not None:
        return hit

    # Second pass: check by filename extension for any XML file
    hit = _first_well_formed(
        lambda name, subtype: name.lower().endswith(".xml"),
        "Found XML attachment via extension fallback: %s",
        "Attachment '%s' has .xml extension but invalid XML content",
    )
    if hit is not None:
        return hit

    return (None, None)
def _profile_from_urn(urn: str) -> str: def _profile_from_urn(urn: str) -> str:
"""Extract a short profile name from a Factur-X/ZUGFeRD URN. """Extract a short profile name from a Factur-X/ZUGFeRD URN.
@@ -440,6 +517,9 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
details=f"Size: {len(pdf_bytes)} bytes", details=f"Size: {len(pdf_bytes)} bytes",
) )
xml_filename = None
xml_bytes = None
try: try:
xml_filename, xml_bytes = get_xml_from_pdf(pdf_bytes, check_xsd=False) xml_filename, xml_bytes = get_xml_from_pdf(pdf_bytes, check_xsd=False)
except Exception as e: except Exception as e:
@@ -449,16 +529,21 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
error_code="password_protected_pdf", error_code="password_protected_pdf",
message="PDF is password protected", message="PDF is password protected",
details=str(e), details=str(e),
) ) from e
if "pdf" in error_msg or "trailer" in error_msg or "xref" in error_msg: if "pdf" in error_msg or "trailer" in error_msg or "xref" in error_msg:
raise ExtractionError( raise ExtractionError(
error_code="invalid_pdf", message="Invalid PDF file", details=str(e) error_code="invalid_pdf", message="Invalid PDF file", details=str(e)
) ) from e
raise ExtractionError( raise ExtractionError(
error_code="corrupt_pdf", error_code="corrupt_pdf",
message="Failed to extract XML from PDF", message="Failed to extract XML from PDF",
details=str(e), details=str(e),
) ) from e
# Fallback: if the factur-x library didn't find XML, search the attachments by
# MIME type and then by ".xml" filename extension. This handles XRechnung and
# other XML attachments with non-standard filenames.
if not xml_bytes:
xml_filename, xml_bytes = _find_xml_attachment_fallback(pdf_bytes)
if not xml_bytes: if not xml_bytes:
pdf_text = extract_text_from_pdf(pdf_bytes) pdf_text = extract_text_from_pdf(pdf_bytes)
@@ -481,9 +566,10 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
try: try:
level = get_level(xml_root, flavor) level = get_level(xml_root, flavor)
except ValueError: except ValueError:
# GuidelineSpecifiedDocumentContextParameter uses ram: namespace
urn_nodes = xml_root.xpath( urn_nodes = xml_root.xpath(
"//rsm:ExchangedDocumentContext/" "//rsm:ExchangedDocumentContext/"
"rsm:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()", "ram:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
namespaces=NAMESPACES, namespaces=NAMESPACES,
) )
urn = urn_nodes[0] if urn_nodes else "" urn = urn_nodes[0] if urn_nodes else ""

View File

@@ -7,6 +7,8 @@ then implementation makes them pass (GREEN phase).
import pytest import pytest
import base64 import base64
from pypdf import PdfReader, PdfWriter
class TestExtractionError: class TestExtractionError:
"""Test ExtractionError exception class.""" """Test ExtractionError exception class."""
@@ -299,3 +301,60 @@ class TestZUGFeRDProfileVariations:
# Profile should be detected (BASIC, BASICWL, etc.) # Profile should be detected (BASIC, BASICWL, etc.)
assert result.zugferd_profil is not None assert result.zugferd_profil is not None
assert result.xml_data is not None assert result.xml_data is not None
class TestXRechnungExtraction:
    """Extraction from XRechnung PDFs whose XML attachment has a non-standard name."""

    def test_xrechnung_by_xml_extension_fallback(self):
        """An attachment named 'xrechnung.xml' is picked up by the fallback.

        The factur-x library does not recognize non-standard filenames such
        as 'xrechnung.xml'. A synthetic PDF carrying the fixture's XML under
        that name is built here so the fallback code path is exercised, both
        directly and through the full extraction entry point.
        """
        from io import BytesIO

        from facturx import get_xml_from_pdf

        from src.extractor import extract_zugferd, _find_xml_attachment_fallback

        # Take the payload of the first attachment of the existing fixture.
        with open("tests/fixtures/validXRechnung.pdf", "rb") as fixture:
            first_attachment = next(iter(PdfReader(fixture).attachment_list), None)
            xml_content = (
                first_attachment.content if first_attachment is not None else None
            )

        if xml_content is None:
            pytest.fail("Test fixture has no XML attachment")

        # Build a minimal one-page PDF carrying the XML under a non-standard name.
        writer = PdfWriter()
        writer.add_blank_page(width=72, height=72)
        writer.add_attachment("xrechnung.xml", xml_content)
        buffer = BytesIO()
        writer.write(buffer)
        pdf_bytes = buffer.getvalue()

        # factur-x itself must NOT recognize the attachment (filename is False).
        facturx_filename = get_xml_from_pdf(pdf_bytes, check_xsd=False)[0]
        assert facturx_filename is False, "factur-x should not recognize xrechnung.xml"

        # The fallback helper, in contrast, must find it.
        found_name, found_xml = _find_xml_attachment_fallback(pdf_bytes)
        assert found_name == "xrechnung.xml", (
            "Fallback should find xrechnung.xml"
        )
        assert found_xml is not None, "Fallback should return XML content"

        # The full extraction must succeed via the fallback.
        result = extract_zugferd(pdf_bytes)
        assert result.is_zugferd is True
        assert result.xml_raw is not None
        assert result.xml_data is not None
        assert result.xml_data.invoice_number is not None
        assert result.extraction_meta.xml_attachment_name == "xrechnung.xml"