diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..7d5fb9d --- /dev/null +++ b/.envrc @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Activate the devshell from the Nix flake +# This loads all tools and environment variables defined in flake.nix + +use flake diff --git a/.gitignore b/.gitignore index 7075402..d9be8d0 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,6 @@ opencode.json .sidecar-start.sh .sidecar-base .td-root + +# pi +.pi* diff --git a/.opencode-rules b/.opencode-rules new file mode 120000 index 0000000..befb943 --- /dev/null +++ b/.opencode-rules @@ -0,0 +1 @@ +/nix/store/5f1zpyn23pabv4j30g651cy2bfkpi7f6-source/rules \ No newline at end of file diff --git a/src/extractor.py b/src/extractor.py index 8d15978..cb9d268 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -6,7 +6,6 @@ Supports ZUGFeRD 2.x profiles: MINIMUM, BASIC, BASIC WL, EN16931, EXTENDED. import io import time -from typing import Any from facturx import get_flavor, get_level, get_xml_from_pdf from lxml import etree @@ -402,6 +401,84 @@ def get_pdf_page_count(pdf_bytes: bytes) -> int: return 0 +def _find_xml_attachment_fallback(pdf_bytes: bytes) -> tuple[str | None, bytes | None]: + """Find XML attachment by MIME type or filename extension (fallback mechanism). + + This handles XRechnung and other XML-based invoices with non-standard + filenames (e.g., 'xrechnung.xml' instead of 'factur-x.xml') that the + factur-x library does not recognize. + + Args: + pdf_bytes: Raw PDF file content. + + Returns: + Tuple of (filename, xml_bytes) or (None, None) if not found. + """ + import logging + + logger = logging.getLogger(__name__) + + try: + pdf_stream = io.BytesIO(pdf_bytes) + reader = PdfReader(pdf_stream) + except (PdfReadError, PyPdfError) as e: + logger.debug( + "Failed to read PDF for XML attachment fallback search: %s", + str(e), + ) + return (None, None) + except Exception as e: + raise ExtractionError( + error_code="fallback_pdf_read_error", + message="Unexpected error reading PDF for XML attachment fallback", + details=str(e), + ) from e + + # First pass: check MIME type via Subtype in Filespec or by content type + # Common XML MIME types: application/xml, text/xml + for att in reader.attachment_list: + if not att.content: + continue + + subtype = getattr(att, "subtype", None) or getattr(att, "mediatype", None) + + if subtype and ("xml" in str(subtype).lower() or "xml" in att.name.lower()): + try: + etree.fromstring(att.content) + logger.debug( + "Found XML attachment via MIME fallback: %s", + att.name, + ) + return (att.name, att.content) + except etree.XMLSyntaxError: + logger.debug( + "Attachment '%s' has XML MIME but invalid XML content", + att.name, + ) + continue + + # Second pass: check by filename extension for any XML file + for att in reader.attachment_list: + if not att.content: + continue + if att.name.lower().endswith(".xml"): + try: + etree.fromstring(att.content) + logger.debug( + "Found XML attachment via extension fallback: %s", + att.name, + ) + return (att.name, att.content) + except etree.XMLSyntaxError: + logger.debug( + "Attachment '%s' has .xml extension but invalid XML content", + att.name, + ) + continue + + return (None, None) + + def _profile_from_urn(urn: str) -> str: """Extract a short profile name from a Factur-X/ZUGFeRD URN. @@ -440,6 +517,9 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse: details=f"Size: {len(pdf_bytes)} bytes", ) + xml_filename = None + xml_bytes = None + try: xml_filename, xml_bytes = get_xml_from_pdf(pdf_bytes, check_xsd=False) except Exception as e: @@ -449,16 +529,21 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse: error_code="password_protected_pdf", message="PDF is password protected", details=str(e), - ) + ) from e if "pdf" in error_msg or "trailer" in error_msg or "xref" in error_msg: raise ExtractionError( error_code="invalid_pdf", message="Invalid PDF file", details=str(e) - ) + ) from e raise ExtractionError( error_code="corrupt_pdf", message="Failed to extract XML from PDF", details=str(e), - ) + ) from e + + # Fallback: If factur-x library didn't find XML, check attachments by MIME type + # This handles XRechnung and other XML attachments with non-standard filenames + if not xml_bytes: + xml_filename, xml_bytes = _find_xml_attachment_fallback(pdf_bytes) if not xml_bytes: pdf_text = extract_text_from_pdf(pdf_bytes) @@ -481,9 +566,10 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse: try: level = get_level(xml_root, flavor) except ValueError: + # GuidelineSpecifiedDocumentContextParameter uses ram: namespace urn_nodes = xml_root.xpath( "//rsm:ExchangedDocumentContext/" - "rsm:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()", + "ram:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()", namespaces=NAMESPACES, ) urn = urn_nodes[0] if urn_nodes else "" diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 1d4fa85..33eafb3 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -7,6 +7,8 @@ then implementation makes them pass (GREEN phase). import pytest import base64 +from pypdf import PdfReader, PdfWriter + class TestExtractionError: """Test ExtractionError exception class.""" @@ -299,3 +301,60 @@ class TestZUGFeRDProfileVariations: # Profile should be detected (BASIC, BASICWL, etc.) assert result.zugferd_profil is not None assert result.xml_data is not None + + +class TestXRechnungExtraction: + """Test extraction from XRechnung PDFs with non-standard filenames.""" + + def test_xrechnung_by_xml_extension_fallback(self): + """Test XRechnung with 'xrechnung.xml' filename is detected via fallback. + + This test verifies the fallback mechanism when factur-x library + doesn't recognize non-standard filenames like 'xrechnung.xml'. + We create a synthetic PDF with a non-standard attachment name to + exercise the fallback code path. + """ + from io import BytesIO + + from src.extractor import extract_zugferd, _find_xml_attachment_fallback + + # Load valid XML from existing test fixture + with open("tests/fixtures/validXRechnung.pdf", "rb") as f: + orig_reader = PdfReader(f) + xml_content = None + for att in orig_reader.attachment_list: + xml_content = att.content + break + if xml_content is None: + pytest.fail("Test fixture has no XML attachment") + + # Create a new PDF with a non-standard attachment name + pdf_writer = PdfWriter() + pdf_writer.add_blank_page(width=72, height=72) + pdf_writer.add_attachment("xrechnung.xml", xml_content) + + output = BytesIO() + pdf_writer.write(output) + pdf_bytes = output.getvalue() + + # Verify factur-x does NOT recognize it (returns False/None) + from facturx import get_xml_from_pdf + + facturx_result = get_xml_from_pdf(pdf_bytes, check_xsd=False) + assert facturx_result[0] is False, "factur-x should not recognize xrechnung.xml" + + # Verify our fallback function DOES find it + fallback_result = _find_xml_attachment_fallback(pdf_bytes) + assert fallback_result[0] == "xrechnung.xml", ( + "Fallback should find xrechnung.xml" + ) + assert fallback_result[1] is not None, "Fallback should return XML content" + + # Verify full extraction works via fallback + result = extract_zugferd(pdf_bytes) + + assert result.is_zugferd is True + assert result.xml_raw is not None + assert result.xml_data is not None + assert result.xml_data.invoice_number is not None + assert result.extraction_meta.xml_attachment_name == "xrechnung.xml"