fix(extractor): detect xrechnung XML attachments
This commit is contained in:
@@ -6,7 +6,6 @@ Supports ZUGFeRD 2.x profiles: MINIMUM, BASIC, BASIC WL, EN16931, EXTENDED.
|
||||
|
||||
import io
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from facturx import get_flavor, get_level, get_xml_from_pdf
|
||||
from lxml import etree
|
||||
@@ -402,6 +401,84 @@ def get_pdf_page_count(pdf_bytes: bytes) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def _is_well_formed_xml(content: bytes) -> bool:
    """Return True if *content* parses as well-formed XML, False otherwise."""
    # Attachments originate from untrusted PDFs: never expand entities or
    # fetch external resources while merely probing for well-formedness.
    parser = etree.XMLParser(resolve_entities=False, no_network=True)
    try:
        etree.fromstring(content, parser)
    except etree.XMLSyntaxError:
        return False
    return True


def _find_xml_attachment_fallback(pdf_bytes: bytes) -> tuple[str | None, bytes | None]:
    """Find an embedded XML attachment by MIME type or filename extension.

    Fallback for XRechnung and other XML-based invoices whose attachment has a
    non-standard filename (e.g. 'xrechnung.xml' instead of 'factur-x.xml')
    that the factur-x library does not recognize.

    The search runs in two passes over the PDF's attachments:

    1. Attachments that declare a subtype/mediatype, where either that value
       or the attachment name contains "xml".
    2. Attachments whose name ends with ".xml".

    In both passes the first candidate that parses as well-formed XML wins;
    candidates that fail to parse are logged at debug level and skipped.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Tuple of (filename, xml_bytes) or (None, None) if not found, or if the
        PDF could not be opened due to a pypdf read error.

    Raises:
        ExtractionError: If opening the PDF fails with an unexpected
            (non-pypdf) exception.
    """
    import logging

    logger = logging.getLogger(__name__)

    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
    except (PdfReadError, PyPdfError) as e:
        # An unreadable PDF simply means "no fallback attachment available".
        logger.debug(
            "Failed to read PDF for XML attachment fallback search: %s",
            str(e),
        )
        return (None, None)
    except Exception as e:
        raise ExtractionError(
            error_code="fallback_pdf_read_error",
            message="Unexpected error reading PDF for XML attachment fallback",
            details=str(e),
        ) from e

    # First pass: check MIME type via Subtype in Filespec or by content type.
    # Common XML MIME types: application/xml, text/xml
    for att in reader.attachment_list:
        content = att.content
        if not content:
            continue

        subtype = getattr(att, "subtype", None) or getattr(att, "mediatype", None)
        if not subtype:
            continue
        if "xml" not in str(subtype).lower() and "xml" not in att.name.lower():
            continue

        if _is_well_formed_xml(content):
            logger.debug(
                "Found XML attachment via MIME fallback: %s",
                att.name,
            )
            return (att.name, content)
        logger.debug(
            "Attachment '%s' has XML MIME but invalid XML content",
            att.name,
        )

    # Second pass: check by filename extension for any XML file.
    for att in reader.attachment_list:
        content = att.content
        if not content:
            continue
        if not att.name.lower().endswith(".xml"):
            continue

        if _is_well_formed_xml(content):
            logger.debug(
                "Found XML attachment via extension fallback: %s",
                att.name,
            )
            return (att.name, content)
        logger.debug(
            "Attachment '%s' has .xml extension but invalid XML content",
            att.name,
        )

    return (None, None)
|
||||
|
||||
|
||||
def _profile_from_urn(urn: str) -> str:
|
||||
"""Extract a short profile name from a Factur-X/ZUGFeRD URN.
|
||||
|
||||
@@ -440,6 +517,9 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
|
||||
details=f"Size: {len(pdf_bytes)} bytes",
|
||||
)
|
||||
|
||||
xml_filename = None
|
||||
xml_bytes = None
|
||||
|
||||
try:
|
||||
xml_filename, xml_bytes = get_xml_from_pdf(pdf_bytes, check_xsd=False)
|
||||
except Exception as e:
|
||||
@@ -449,16 +529,21 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
|
||||
error_code="password_protected_pdf",
|
||||
message="PDF is password protected",
|
||||
details=str(e),
|
||||
)
|
||||
) from e
|
||||
if "pdf" in error_msg or "trailer" in error_msg or "xref" in error_msg:
|
||||
raise ExtractionError(
|
||||
error_code="invalid_pdf", message="Invalid PDF file", details=str(e)
|
||||
)
|
||||
) from e
|
||||
raise ExtractionError(
|
||||
error_code="corrupt_pdf",
|
||||
message="Failed to extract XML from PDF",
|
||||
details=str(e),
|
||||
)
|
||||
) from e
|
||||
|
||||
# Fallback: If factur-x library didn't find XML, check attachments by MIME type
|
||||
# This handles XRechnung and other XML attachments with non-standard filenames
|
||||
if not xml_bytes:
|
||||
xml_filename, xml_bytes = _find_xml_attachment_fallback(pdf_bytes)
|
||||
|
||||
if not xml_bytes:
|
||||
pdf_text = extract_text_from_pdf(pdf_bytes)
|
||||
@@ -481,9 +566,10 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
|
||||
try:
|
||||
level = get_level(xml_root, flavor)
|
||||
except ValueError:
|
||||
# GuidelineSpecifiedDocumentContextParameter uses ram: namespace
|
||||
urn_nodes = xml_root.xpath(
|
||||
"//rsm:ExchangedDocumentContext/"
|
||||
"rsm:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
|
||||
"ram:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
|
||||
namespaces=NAMESPACES,
|
||||
)
|
||||
urn = urn_nodes[0] if urn_nodes else ""
|
||||
|
||||
Reference in New Issue
Block a user