fallback in extractor if URN not valid

This commit is contained in:
2026-04-09 08:50:35 +02:00
parent e47e36d55c
commit 18a902bcc9

View File

@@ -402,6 +402,23 @@ def get_pdf_page_count(pdf_bytes: bytes) -> int:
return 0 return 0
def _profile_from_urn(urn: str) -> str:
    """Extract a short profile name from a Factur-X/ZUGFeRD URN.

    The short name is the text after the last ':' within the part of the
    URN that follows the last '#'. Surrounding whitespace (as commonly
    found around XML text nodes) is ignored.

    Args:
        urn: The full profile URN (e.g.
            'urn:cen.eu:en16931:2017#compliant#urn:xeinkauf.de:kosit:xrechnung_3.0')

    Returns:
        Short profile name (e.g. 'xrechnung_3.0'), or 'unknown' when the
        URN is empty, whitespace-only, or ends in a delimiter so that no
        name can be extracted.
    """
    # XML text nodes may carry indentation/newlines; normalise before splitting.
    cleaned = urn.strip() if urn else ""

    # Keep only what follows the last '#', then what follows the last ':'.
    last_fragment = cleaned.rsplit("#", maxsplit=1)[-1]
    profile = last_fragment.rsplit(":", maxsplit=1)[-1].strip()

    # A trailing '#' or ':' yields an empty segment; never return "".
    return profile or "unknown"
def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse: def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
"""Extract ZUGFeRD data from PDF bytes. """Extract ZUGFeRD data from PDF bytes.
@@ -460,7 +477,17 @@ def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
xml_root = etree.fromstring(xml_bytes) xml_root = etree.fromstring(xml_bytes)
flavor = get_flavor(xml_root) flavor = get_flavor(xml_root)
level = get_level(xml_root, flavor)
try:
level = get_level(xml_root, flavor)
except ValueError:
urn_nodes = xml_root.xpath(
"//rsm:ExchangedDocumentContext/"
"rsm:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
namespaces=NAMESPACES,
)
urn = urn_nodes[0] if urn_nodes else ""
level = _profile_from_urn(urn)
xml_data = parse_zugferd_xml(xml_bytes) xml_data = parse_zugferd_xml(xml_bytes)
pdf_text = extract_text_from_pdf(pdf_bytes) pdf_text = extract_text_from_pdf(pdf_bytes)