diff --git a/src/extractor.py b/src/extractor.py index 5e8ed72..8d15978 100644 --- a/src/extractor.py +++ b/src/extractor.py @@ -402,6 +402,23 @@ def get_pdf_page_count(pdf_bytes: bytes) -> int: return 0 +def _profile_from_urn(urn: str) -> str: + """Extract a short profile name from a Factur-X/ZUGFeRD URN. + + Falls back to the last segment of the URN after '#', or 'unknown'. + + Args: + urn: The full profile URN (e.g. + 'urn:cen.eu:en16931:2017#compliant#urn:xeinkauf.de:kosit:xrechnung_3.0') + + Returns: + Short profile name (e.g. 'xrechnung_3.0') + """ + if not urn: + return "unknown" + return urn.rsplit("#", maxsplit=1)[-1].rsplit(":", maxsplit=1)[-1] + + def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse: """Extract ZUGFeRD data from PDF bytes. @@ -460,7 +477,17 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse: xml_root = etree.fromstring(xml_bytes) flavor = get_flavor(xml_root) - level = get_level(xml_root, flavor) + + try: + level = get_level(xml_root, flavor) + except ValueError: + urn_nodes = xml_root.xpath( + "//rsm:ExchangedDocumentContext/" + "rsm:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()", + namespaces=NAMESPACES, + ) + urn = urn_nodes[0] if urn_nodes else "" + level = _profile_from_urn(urn) xml_data = parse_zugferd_xml(xml_bytes) pdf_text = extract_text_from_pdf(pdf_bytes)