Files
zugferd-service/src/extractor.py

509 lines
16 KiB
Python

"""ZUGFeRD/Factur-X extractor.
Extracts structured invoice data from PDF files using the factur-x library.
Supports ZUGFeRD 2.x profiles: MINIMUM, BASIC, BASIC WL, EN16931, EXTENDED.
"""
import io
import time
from typing import Any
from facturx import get_flavor, get_level, get_xml_from_pdf
from lxml import etree
from pypdf import PdfReader
from pypdf.errors import PdfReadError, PyPdfError
from src.models import (
Buyer,
ExtractionMeta,
ExtractResponse,
LineItem,
PaymentTerms,
Supplier,
Totals,
VatBreakdown,
XmlData,
)
# Prefix -> namespace URI map handed to every XPath query in this module via
# `namespaces=NAMESPACES`. The prefixes used in the XPath strings below
# ("rsm", "ram", "udt") must match the keys defined here.
NAMESPACES = {
"rsm": "urn:un:unece:uncefact:data:standard:CrossIndustryInvoice:100",
"ram": "urn:un:unece:uncefact:data:standard:ReusableAggregateBusinessInformationEntity:100",
"udt": "urn:un:unece:uncefact:data:standard:UnqualifiedDataType:100",
}
class ExtractionError(Exception):
    """Raised when a PDF cannot be processed by the extractor.

    Attributes:
        error_code: Short machine-readable identifier of the failure.
        message: Human-readable summary; also used as the exception text.
        details: Optional extra information (empty string when not given).
    """

    def __init__(self, error_code: str, message: str, details: str = ""):
        # Only the message is forwarded to Exception, so str(err) == message.
        super().__init__(message)
        self.details = details
        self.message = message
        self.error_code = error_code
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract text from PDF using pypdf.

    Pages that yield no text are skipped; the remaining page texts are
    joined with a single newline.

    Args:
        pdf_bytes: Raw PDF file content

    Returns:
        Extracted text from all pages (empty string if no page has text)

    Raises:
        ExtractionError: With error_code "corrupt_pdf" when pypdf cannot
            read the document. The pypdf exception is attached as the
            cause.
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)
    except (PdfReadError, PyPdfError) as e:
        # Chain the original error so the traceback shows the real cause
        # instead of "During handling ... another exception occurred".
        raise ExtractionError(
            error_code="corrupt_pdf", message="Failed to read PDF", details=str(e)
        ) from e
def parse_supplier(xml_root: etree._Element) -> Supplier:
    """Build the Supplier model from the SellerTradeParty element.

    Every field takes the first matching text node. A missing name becomes
    an empty string; every other missing field becomes None.

    Args:
        xml_root: XML root element

    Returns:
        Supplier model
    """
    seller = "//ram:ApplicableHeaderTradeAgreement/ram:SellerTradeParty/"
    address = seller + "ram:PostalTradeAddress/"

    def first(path: str, default: str | None = None) -> str | None:
        # First XPath hit, or `default` when nothing matched.
        hits = xml_root.xpath(path, namespaces=NAMESPACES)
        return hits[0] if hits else default

    return Supplier(
        name=first(seller + "ram:Name/text()", ""),
        street=first(address + "ram:LineOne/text()"),
        postal_code=first(address + "ram:PostcodeCode/text()"),
        city=first(address + "ram:CityName/text()"),
        country=first(address + "ram:CountryID/text()"),
        # Only tax registrations flagged with schemeID 'VA' are considered.
        vat_id=first(
            seller + "ram:SpecifiedTaxRegistration/ram:ID[@schemeID='VA']/text()"
        ),
        email=first(seller + "ram:URIUniversalCommunication/ram:URIID/text()"),
    )
def parse_buyer(xml_root: etree._Element) -> Buyer:
    """Build the Buyer model from the BuyerTradeParty element.

    Every field takes the first matching text node. A missing name becomes
    an empty string; every other missing field becomes None.

    Args:
        xml_root: XML root element

    Returns:
        Buyer model
    """
    party = "//ram:ApplicableHeaderTradeAgreement/ram:BuyerTradeParty/"
    postal = party + "ram:PostalTradeAddress/"

    def lookup(path: str, fallback: str | None = None) -> str | None:
        # First XPath hit, or `fallback` when nothing matched.
        matches = xml_root.xpath(path, namespaces=NAMESPACES)
        if matches:
            return matches[0]
        return fallback

    return Buyer(
        name=lookup(party + "ram:Name/text()", ""),
        street=lookup(postal + "ram:LineOne/text()"),
        postal_code=lookup(postal + "ram:PostcodeCode/text()"),
        city=lookup(postal + "ram:CityName/text()"),
        country=lookup(postal + "ram:CountryID/text()"),
        # Only tax registrations flagged with schemeID 'VA' are considered.
        vat_id=lookup(
            party + "ram:SpecifiedTaxRegistration/ram:ID[@schemeID='VA']/text()"
        ),
    )
def parse_line_items(xml_root: etree._Element) -> list[LineItem]:
    """Collect all IncludedSupplyChainTradeLineItem elements as LineItem models.

    Positions are assigned by document order, starting at 1. Missing
    quantity, unit price and line total default to 0.0, a missing unit code
    defaults to "Stück", a missing description to "", and all other missing
    fields to None.

    Args:
        xml_root: XML root element

    Returns:
        List of LineItem models
    """
    product = "./ram:SpecifiedTradeProduct/"
    billed = "./ram:SpecifiedLineTradeDelivery/ram:BilledQuantity"
    settlement = "./ram:SpecifiedLineTradeSettlement/"
    tax = settlement + "ram:ApplicableTradeTax/"

    def pick(node: etree._Element, path: str, default: str | None = None):
        # First XPath hit relative to `node`, or `default` when nothing matched.
        found = node.xpath(path, namespaces=NAMESPACES)
        return found[0] if found else default

    def number(node: etree._Element, path: str, default: float | None):
        # Like pick(), but converts a hit to float.
        raw = pick(node, path)
        return default if raw is None else float(raw)

    def build(position: int, node: etree._Element) -> LineItem:
        return LineItem(
            position=position,
            article_number=pick(node, product + "ram:SellerAssignedID/text()"),
            article_number_buyer=pick(node, product + "ram:BuyerAssignedID/text()"),
            description=pick(node, product + "ram:Name/text()", ""),
            quantity=number(node, billed + "/text()", 0.0),
            unit=pick(node, billed + "/@unitCode", "Stück"),
            unit_price=number(
                node,
                "./ram:SpecifiedLineTradeAgreement/ram:NetPriceProductTradePrice/"
                "ram:ChargeAmount/text()",
                0.0,
            ),
            line_total=number(
                node,
                settlement
                + "ram:SpecifiedTradeSettlementLineMonetarySummation/"
                "ram:LineTotalAmount/text()",
                0.0,
            ),
            vat_rate=number(node, tax + "ram:RateApplicablePercent/text()", None),
            vat_amount=number(node, tax + "ram:CalculatedAmount/text()", None),
        )

    nodes = xml_root.xpath(
        "//ram:IncludedSupplyChainTradeLineItem", namespaces=NAMESPACES
    )
    return [build(position, node) for position, node in enumerate(nodes, start=1)]
def parse_totals(xml_root: etree._Element) -> Totals:
    """Read the header monetary summation and the per-rate VAT breakdown.

    Missing summation amounts default to 0.0. A header ApplicableTradeTax
    entry is only included in the breakdown when rate, basis amount and
    calculated amount are all present.

    Args:
        xml_root: XML root element

    Returns:
        Totals model
    """
    summation = "//ram:SpecifiedTradeSettlementHeaderMonetarySummation/"

    def raw_amount(tag: str) -> list:
        # Raw XPath hits (possibly empty) for one child of the summation.
        return xml_root.xpath(f"{summation}ram:{tag}/text()", namespaces=NAMESPACES)

    def as_float(hits: list) -> float:
        return float(hits[0]) if hits else 0.0

    line_total_hits = raw_amount("LineTotalAmount")
    net_hits = raw_amount("TaxBasisTotalAmount")
    vat_hits = raw_amount("TaxTotalAmount")
    gross_hits = raw_amount("GrandTotalAmount")

    breakdown = []
    tax_nodes = xml_root.xpath(
        "//ram:ApplicableHeaderTradeSettlement/ram:ApplicableTradeTax",
        namespaces=NAMESPACES,
    )
    for tax_node in tax_nodes:
        parts = [
            tax_node.xpath(f"./ram:{tag}/text()", namespaces=NAMESPACES)
            for tag in ("RateApplicablePercent", "BasisAmount", "CalculatedAmount")
        ]
        if not all(parts):
            # Incomplete tax entries are dropped rather than padded with zeros.
            continue
        rate, base, amount = (float(hits[0]) for hits in parts)
        breakdown.append(VatBreakdown(rate=rate, base=base, amount=amount))

    return Totals(
        line_total_sum=as_float(line_total_hits),
        net=as_float(net_hits),
        vat_total=as_float(vat_hits),
        gross=as_float(gross_hits),
        vat_breakdown=breakdown,
    )
def parse_payment_terms(xml_root: etree._Element) -> PaymentTerms | None:
    """Read bank details from SpecifiedTradeSettlementPaymentMeans.

    Args:
        xml_root: XML root element

    Returns:
        PaymentTerms model, or None when neither IBAN, BIC nor account
        holder is present in the document
    """
    means = "//ram:SpecifiedTradeSettlementPaymentMeans/"
    # PaymentTerms field name -> XPath relative to the payment means element.
    field_paths = {
        "iban": "ram:PayeePartyCreditorFinancialAccount/ram:IBANID/text()",
        "bic": "ram:PayeePartyCreditorFinancialInstitution/ram:BICID/text()",
        "account_holder": (
            "ram:PayeePartyCreditorFinancialAccount/"
            "ram:ProprietaryAccountName/text()"
        ),
    }
    hits_by_field = {
        field: xml_root.xpath(means + rel_path, namespaces=NAMESPACES)
        for field, rel_path in field_paths.items()
    }

    if not any(hits_by_field.values()):
        return None

    return PaymentTerms(
        **{
            field: (hits[0] if hits else None)
            for field, hits in hits_by_field.items()
        }
    )
def parse_zugferd_xml(xml_bytes: bytes) -> XmlData:
    """Parse ZUGFeRD XML bytes to structured XmlData.

    Header fields take the first matching text node. A missing invoice
    number or invoice date becomes an empty string, a missing currency
    becomes "EUR", and other missing optional fields become None.

    The due date is read from
    ApplicableHeaderTradeSettlement/SpecifiedTradePaymentTerms/DueDateDateTime.
    If that element is absent, the previously used lookup
    (ApplicableTradeDeliveryTerms/Description) is tried so that existing
    callers keep receiving a value where they did before.

    Args:
        xml_bytes: Raw XML bytes from PDF

    Returns:
        XmlData model with all invoice fields

    Raises:
        lxml.etree.XMLSyntaxError: If xml_bytes is not well-formed XML.
    """
    # The XML was embedded in a PDF supplied by the caller, so treat it as
    # untrusted: do not expand entities or touch the network while parsing.
    parser = etree.XMLParser(resolve_entities=False, no_network=True)
    xml_root = etree.fromstring(xml_bytes, parser=parser)

    def first(path: str, default: str | None = None) -> str | None:
        # First XPath hit, or `default` when nothing matched.
        hits = xml_root.xpath(path, namespaces=NAMESPACES)
        return hits[0] if hits else default

    invoice_number = first("//rsm:ExchangedDocument/ram:ID/text()", "")
    invoice_date = first(
        "//rsm:ExchangedDocument/ram:IssueDateTime/"
        "udt:DateTimeString[@format='102']/text()",
        "",
    )

    due_date = first(
        "//ram:ApplicableHeaderTradeSettlement/ram:SpecifiedTradePaymentTerms/"
        "ram:DueDateDateTime/udt:DateTimeString/text()"
    )
    if due_date is None:
        # NOTE(review): legacy lookup kept for backward compatibility only.
        # It reads delivery-terms text, not a payment due date -- confirm
        # whether any consumer relies on it before removing.
        due_date = first(
            "//ram:ApplicableHeaderTradeAgreement/ram:ApplicableTradeDeliveryTerms/"
            "ram:Description/text()"
        )

    # Only the first IncludedNote is reported.
    notes = first("//rsm:ExchangedDocument/ram:IncludedNote/ram:Content/text()")
    currency = first(
        "//ram:ApplicableHeaderTradeSettlement/ram:InvoiceCurrencyCode/text()",
        "EUR",
    )

    return XmlData(
        invoice_number=invoice_number,
        invoice_date=invoice_date,
        due_date=due_date,
        supplier=parse_supplier(xml_root),
        buyer=parse_buyer(xml_root),
        line_items=parse_line_items(xml_root),
        totals=parse_totals(xml_root),
        currency=currency,
        payment_terms=parse_payment_terms(xml_root),
        notes=notes,
    )
def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """Count the pages of a PDF.

    Args:
        pdf_bytes: Raw PDF file content

    Returns:
        Number of pages, or 0 when pypdf fails to read the document
    """
    try:
        page_total = len(PdfReader(io.BytesIO(pdf_bytes)).pages)
    except (PdfReadError, PyPdfError):
        # Best effort: an unreadable PDF is reported as having zero pages.
        page_total = 0
    return page_total
def _profile_from_urn(urn: str) -> str:
"""Extract a short profile name from a Factur-X/ZUGFeRD URN.
Falls back to the last segment of the URN after '#', or 'unknown'.
Args:
urn: The full profile URN (e.g.
'urn:cen.eu:en16931:2017#compliant#urn:xeinkauf.de:kosit:xrechnung_3.0')
Returns:
Short profile name (e.g. 'xrechnung_3.0')
"""
if not urn:
return "unknown"
return urn.rsplit("#", maxsplit=1)[-1].rsplit(":", maxsplit=1)[-1]
def extract_zugferd(pdf_bytes: bytes) -> ExtractResponse:
    """Extract ZUGFeRD data from PDF bytes.

    Looks for an embedded invoice XML. When none is found, only the PDF
    text and metadata are returned with is_zugferd=False. Otherwise the
    profile is detected, the XML is parsed into structured data, and both
    are returned together with the raw XML and the PDF text.

    Args:
        pdf_bytes: Raw PDF file content as bytes

    Returns:
        ExtractResponse with is_zugferd, profile, xml_data, pdf_text

    Raises:
        ExtractionError: For PDF processing errors. error_code is one of
            "file_too_large", "password_protected_pdf", "invalid_pdf" or
            "corrupt_pdf". Errors translated from the underlying libraries
            carry the original exception as their cause.
    """
    # perf_counter is meant for measuring durations; unlike time.time() it
    # is not affected by wall-clock adjustments.
    started = time.perf_counter()

    def elapsed_ms() -> int:
        return int((time.perf_counter() - started) * 1000)

    if len(pdf_bytes) > 10 * 1024 * 1024:
        raise ExtractionError(
            error_code="file_too_large",
            message="File exceeds 10MB limit",
            details=f"Size: {len(pdf_bytes)} bytes",
        )

    try:
        xml_filename, xml_bytes = get_xml_from_pdf(pdf_bytes, check_xsd=False)
    except Exception as e:
        # The failure is classified by keywords in the library's message;
        # the order of the checks matters ("password" wins over "pdf").
        error_msg = str(e).lower()
        if "password" in error_msg or "encrypted" in error_msg:
            raise ExtractionError(
                error_code="password_protected_pdf",
                message="PDF is password protected",
                details=str(e),
            ) from e
        if "pdf" in error_msg or "trailer" in error_msg or "xref" in error_msg:
            raise ExtractionError(
                error_code="invalid_pdf", message="Invalid PDF file", details=str(e)
            ) from e
        raise ExtractionError(
            error_code="corrupt_pdf",
            message="Failed to extract XML from PDF",
            details=str(e),
        ) from e

    if not xml_bytes:
        # No embedded invoice XML: return text and metadata only.
        pdf_text = extract_text_from_pdf(pdf_bytes)
        pages = get_pdf_page_count(pdf_bytes)
        return ExtractResponse(
            is_zugferd=False,
            pdf_text=pdf_text,
            extraction_meta=ExtractionMeta(
                pages=pages,
                xml_attachment_name=None,
                extraction_time_ms=elapsed_ms(),
            ),
        )

    # The XML comes from a caller-supplied PDF, so treat it as untrusted: do
    # not expand entities or touch the network while parsing.
    parser = etree.XMLParser(resolve_entities=False, no_network=True)
    xml_root = etree.fromstring(xml_bytes, parser=parser)

    flavor = get_flavor(xml_root)
    try:
        level = get_level(xml_root, flavor)
    except ValueError:
        # Level not recognised by factur-x: derive a name from the
        # guideline URN in the document context instead.
        urn_nodes = xml_root.xpath(
            "//rsm:ExchangedDocumentContext/"
            "rsm:GuidelineSpecifiedDocumentContextParameter/ram:ID/text()",
            namespaces=NAMESPACES,
        )
        urn = urn_nodes[0] if urn_nodes else ""
        level = _profile_from_urn(urn)

    xml_data = parse_zugferd_xml(xml_bytes)
    pdf_text = extract_text_from_pdf(pdf_bytes)
    pages = get_pdf_page_count(pdf_bytes)

    try:
        xml_raw = xml_bytes.decode("utf-8")
    except UnicodeDecodeError:
        # The document is not UTF-8 (e.g. it declares another encoding).
        # lxml has already decoded it, so serialise the parsed tree rather
        # than failing the whole extraction.
        xml_raw = etree.tostring(xml_root, encoding="unicode")

    return ExtractResponse(
        is_zugferd=True,
        zugferd_profil=level.upper(),
        xml_raw=xml_raw,
        xml_data=xml_data,
        pdf_text=pdf_text,
        extraction_meta=ExtractionMeta(
            pages=pages,
            xml_attachment_name=xml_filename or "factur-x.xml",
            extraction_time_ms=elapsed_ms(),
        ),
    )