beancount_extras_kris7t/importers/hetzner_pdf.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136

'''
Importer for Hetzner PDF invoices.
'''
__copyright__ = 'Copyright (c) 2020  Kristóf Marussy <kristof@marussy.com>'
__license__ = 'GNU GPLv2'

import datetime as dt
import logging
import re
from typing import cast, Iterable, Optional

from beancount.core import amount as am, data
from beancount.core.amount import Amount
from beancount.core.flags import FLAG_OKAY, FLAG_WARNING
from beancount.core.number import D
from beancount.ingest.cache import _FileMemo as FileMemo
from beancount.ingest.importer import ImporterProtocol

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage, LTTextContainer

from beancount_extras_kris7t.importers.utils import MISSING_AMOUNT

# Matches the file name of a downloaded invoice and captures the invoice date
# (ISO format) and the invoice number (``R`` followed by digits).
INVOICE_REGEX = re.compile(
    r'.*Hetzner_(?P<date>\d{4}-\d{2}-\d{2})_(?P<number>R\d+)\.pdf$', re.IGNORECASE)
# Captures the invoice total from the "Amount due" text element.
AMOUNT_REGEX = re.compile(r'Amount due: € (?P<amount>\d+(\.\d+)?)', re.IGNORECASE)
# The whole invoice was paid from the client credit balance.
BALANCE_REGEX = re.compile(
    r'The amount has been charged to the credit balance on your client credit account\.',
    re.IGNORECASE)
# The whole invoice will be charged to the credit card.
CARD_REGEX = re.compile(
    r'The invoice amount will soon be debited from your credit card\.',
    re.IGNORECASE)
# The invoice was paid partly from the credit balance and partly by card.
# Both amounts must accept any number of integer digits: a single ``\d`` in the
# ``card`` group would reject remainders of € 10.00 or more.
MIXED_REGEX = re.compile(
    r'The amount of € (?P<balance>\d+(\.\d+)?) has been charged to the credit balance '
    r'on your client credit account\. The remaining amount of € (?P<card>\d+(\.\d+)?) '
    r'will be debited by credit card in the next few days\.',
    re.IGNORECASE)


def _extract_match(pages: Iterable[LTPage], pattern: re.Pattern) -> Optional[re.Match]:
    '''
    Return the first match of ``pattern`` against a text element of ``pages``.

    Only ``LTTextContainer`` elements directly contained in a page are examined.
    The text of each element is stripped and its line breaks are replaced by
    spaces before matching. Matching is anchored at the start of the text
    (``pattern.match``). Returns ``None`` if no element matches.
    '''
    text_containers = (
        item
        for layout_page in pages
        for item in layout_page
        if isinstance(item, LTTextContainer)
    )
    for container in text_containers:
        flattened = container.get_text().strip().replace('\n', ' ')
        found = pattern.match(flattened)
        if found:
            return found
    return None


class Importer(ImporterProtocol):
    '''
    Importer for Hetzner PDF invoices.

    Invoices are recognized by their file name (see ``INVOICE_REGEX``). The
    invoice total and the payment method are read from the text of the PDF.
    '''

    _log: logging.Logger
    _liability: str
    _credit_balance: str
    _expense: str

    def __init__(self, liability: str, credit_balance: str, expense: str):
        '''
        Create an importer.

        ``liability`` is the account posted to for credit card payments,
        ``credit_balance`` the account posted to for payments from the client
        credit balance, and ``expense`` the account that receives the expense.
        '''
        self._log = logging.getLogger(type(self).__qualname__)
        self._liability = liability
        self._credit_balance = credit_balance
        self._expense = expense

    def identify(self, file: FileMemo) -> bool:
        '''Return whether the name of ``file`` looks like a Hetzner invoice.'''
        return INVOICE_REGEX.match(file.name) is not None

    def file_name(self, file: FileMemo) -> str:
        '''
        Return the name ``Hetzner_<invoice number>.pdf`` for ``file``.

        Raises ``RuntimeError`` if the file name is not that of an invoice.
        '''
        if match := INVOICE_REGEX.match(file.name):
            number = match.group('number')
            return f'Hetzner_{number}.pdf'
        else:
            raise RuntimeError(f'Not an invoice: {file.name}')

    def file_account(self, file: FileMemo) -> str:
        '''
        Return the liability account as the account associated with ``file``.

        Raises ``RuntimeError`` if the file name is not that of an invoice.
        '''
        if INVOICE_REGEX.match(file.name) is None:
            raise RuntimeError(f'Not an invoice: {file.name}')
        else:
            return self._liability

    def file_date(self, file: FileMemo) -> dt.date:
        '''
        Return the invoice date encoded in the name of ``file``.

        Raises ``RuntimeError`` if the file name is not that of an invoice.
        '''
        if match := INVOICE_REGEX.match(file.name):
            date_str = match.group('date')
            return dt.datetime.strptime(date_str, '%Y-%m-%d').date()
        else:
            raise RuntimeError(f'Not an invoice: {file.name}')

    def extract(self, file: FileMemo) -> data.Entries:
        '''
        Extract a single transaction from the invoice in ``file``.

        Returns an empty list (after logging a warning) if the file name is not
        that of an invoice or no invoice total is found in the PDF text.

        Otherwise the transaction has one posting per recognized payment method
        (credit balance and/or credit card) followed by the expense posting,
        and is linked with ``hetzner_<invoice number>``. When the payment
        method is unknown or the mixed payments do not add up to the total, the
        transaction is flagged with ``FLAG_WARNING`` and the expense posting
        carries the invoice total; otherwise the expense posting uses
        ``MISSING_AMOUNT``.
        '''
        invoice_match = INVOICE_REGEX.match(file.name)
        if invoice_match is None:
            self._log.warning('Not an invoice: %s', file.name)
            return []
        invoice_number = invoice_match.group('number')
        date = self.file_date(file)
        # Materialize the pages because they are scanned once per pattern below.
        pages = list(cast(Iterable[LTPage], extract_pages(file.name)))
        amount_match = _extract_match(pages, AMOUNT_REGEX)
        if amount_match is None:
            self._log.warning('No amount found in %s', file.name)
            return []
        number = D(amount_match.group('amount'))
        # NOTE(review): presumably only narrows an Optional return type for
        # static checkers; confirm against the signature of D.
        assert number is not None
        amount = Amount(number, 'EUR')
        postings = []
        flag = FLAG_OKAY
        if _extract_match(pages, BALANCE_REGEX) is not None:
            postings.append(
                data.Posting(self._credit_balance, -amount, None, None, None, None))
        elif _extract_match(pages, CARD_REGEX) is not None:
            postings.append(
                data.Posting(self._liability, -amount, None, None, None, None))
        elif match := _extract_match(pages, MIXED_REGEX):
            balance_number = D(match.group('balance'))
            assert balance_number is not None
            balance_amount = Amount(balance_number, 'EUR')
            postings.append(
                data.Posting(self._credit_balance, -balance_amount, None, None, None, None))
            card_number = D(match.group('card'))
            assert card_number is not None
            card_amount = Amount(card_number, 'EUR')
            postings.append(
                data.Posting(self._liability, -card_amount, None, None, None, None))
            if am.add(balance_amount, card_amount) != amount:
                self._log.warning('Payments do not cover total amount in %s', file.name)
                flag = FLAG_WARNING
        else:
            self._log.warning('Unknown payment method in %s', file.name)
            flag = FLAG_WARNING
        if flag == FLAG_OKAY:
            # The payment postings account for the whole total, so the explicit
            # expense amount is replaced by the project's MISSING_AMOUNT marker
            # (presumably left for beancount to fill in; confirm in utils).
            amount = MISSING_AMOUNT
        postings.append(
            data.Posting(self._expense, amount, None, None, None, None))
        return [
            data.Transaction(data.new_metadata(file.name, 0), date, flag, 'Hetzner', 'Invoice',
                             set(), {f'hetzner_{invoice_number}'}, postings)
        ]