aboutsummaryrefslogtreecommitdiffstats
path: root/beancount_extras_kris7t/importers/hetzner_pdf.py
diff options
context:
space:
mode:
Diffstat (limited to 'beancount_extras_kris7t/importers/hetzner_pdf.py')
-rw-r--r--beancount_extras_kris7t/importers/hetzner_pdf.py136
1 files changed, 136 insertions, 0 deletions
diff --git a/beancount_extras_kris7t/importers/hetzner_pdf.py b/beancount_extras_kris7t/importers/hetzner_pdf.py
new file mode 100644
index 0000000..7eb89b6
--- /dev/null
+++ b/beancount_extras_kris7t/importers/hetzner_pdf.py
@@ -0,0 +1,136 @@
1'''
2Importer for Hetzner PDF invoices.
3'''
4__copyright__ = 'Copyright (c) 2020 Kristóf Marussy <kristof@marussy.com>'
5__license__ = 'GNU GPLv2'
6
7import datetime as dt
8import logging
9import re
10from typing import cast, Iterable, Optional
11
12from beancount.core import amount as am, data
13from beancount.core.amount import Amount
14from beancount.core.flags import FLAG_OKAY, FLAG_WARNING
15from beancount.core.number import D
16from beancount.ingest.cache import _FileMemo as FileMemo
17from beancount.ingest.importer import ImporterProtocol
18
19from pdfminer.high_level import extract_pages
20from pdfminer.layout import LTPage, LTTextContainer
21
22from beancount_extras_kris7t.importers.utils import MISSING_AMOUNT
23
# File name of a downloaded invoice; captures the ISO invoice date and the
# invoice number (an 'R' followed by digits).
INVOICE_REGEX = re.compile(
    r'.*Hetzner_(?P<date>\d{4}-\d{2}-\d{2})_(?P<number>R\d+)\.pdf$', re.IGNORECASE)
# Total of the invoice in euros.
AMOUNT_REGEX = re.compile(r'Amount due: € (?P<amount>\d+(\.\d+)?)', re.IGNORECASE)
# Notice printed when the whole invoice was paid from the client credit balance.
BALANCE_REGEX = re.compile(
    'The amount has been charged to the credit balance on your client credit account.',
    re.IGNORECASE)
# Notice printed when the whole invoice will be paid by credit card.
CARD_REGEX = re.compile(
    'The invoice amount will soon be debited from your credit card.',
    re.IGNORECASE)
# Notice printed when the invoice is paid partly from the credit balance and
# partly by credit card. Both captured amounts accept any number of integer
# digits, so a card remainder of € 10.00 or more is recognised as well.
MIXED_REGEX = re.compile(
    r'The amount of € (?P<balance>\d+(\.\d+)?) has been charged to the credit balance ' +
    r'on your client credit account. The remaining amount of € (?P<card>\d+(\.\d+)?) ' +
    'will be debited by credit card in the next few days.',
    re.IGNORECASE)
38
39
def _extract_match(pages: Iterable[LTPage], pattern: re.Pattern) -> Optional[re.Match]:
    '''
    Return the first match of ``pattern`` against a text element of ``pages``.

    Pages and their elements are visited in iteration order. Only
    ``LTTextContainer`` elements are considered. The text of each one is
    stripped and its line breaks are turned into spaces before
    ``pattern.match`` is applied, so the pattern has to match from the
    beginning of the element's text.

    Returns ``None`` if no text element matches.
    '''
    for page in pages:
        for element in page:
            if not isinstance(element, LTTextContainer):
                continue
            # Join wrapped lines so that a multi-line notice is a single string.
            flattened = ' '.join(element.get_text().strip().split('\n'))
            found = pattern.match(flattened)
            if found is not None:
                return found
    return None
48
49
class Importer(ImporterProtocol):
    '''
    Importer for Hetzner PDF invoices.

    A file is treated as an invoice when its name matches ``INVOICE_REGEX``;
    the invoice date and number are taken from the file name. The amount due
    and the payment method are read from the text of the PDF.
    '''

    _log: logging.Logger
    _liability: str
    _credit_balance: str
    _expense: str

    def __init__(self, liability: str, credit_balance: str, expense: str):
        '''
        Create the importer.

        ``liability`` is the account posted to for amounts debited from the
        credit card and is also returned by ``file_account``.
        ``credit_balance`` is the account posted to for amounts charged to the
        client credit balance. ``expense`` is the account that receives the
        invoice total.
        '''
        self._log = logging.getLogger(type(self).__qualname__)
        self._liability = liability
        self._credit_balance = credit_balance
        self._expense = expense

    @staticmethod
    def _posting(account: str, units) -> data.Posting:
        '''Build a posting with only the account and the units filled in.'''
        return data.Posting(account, units, None, None, None, None)

    def identify(self, file: FileMemo) -> bool:
        '''Return whether the name of ``file`` looks like a Hetzner invoice.'''
        return INVOICE_REGEX.match(file.name) is not None

    def file_name(self, file: FileMemo) -> str:
        '''
        Return ``Hetzner_<invoice number>.pdf`` for the invoice.

        Raises ``RuntimeError`` if the file name is not an invoice name.
        '''
        if match := INVOICE_REGEX.match(file.name):
            number = match.group('number')
            return f'Hetzner_{number}.pdf'
        raise RuntimeError(f'Not an invoice: {file.name}')

    def file_account(self, file: FileMemo) -> str:
        '''
        Return the liability account passed to the constructor.

        Raises ``RuntimeError`` if the file name is not an invoice name.
        '''
        if INVOICE_REGEX.match(file.name) is None:
            raise RuntimeError(f'Not an invoice: {file.name}')
        return self._liability

    def file_date(self, file: FileMemo) -> dt.date:
        '''
        Return the invoice date encoded in the file name.

        Raises ``RuntimeError`` if the file name is not an invoice name.
        '''
        if match := INVOICE_REGEX.match(file.name):
            date_str = match.group('date')
            return dt.datetime.strptime(date_str, '%Y-%m-%d').date()
        raise RuntimeError(f'Not an invoice: {file.name}')

    def extract(self, file: FileMemo) -> data.Entries:
        '''
        Turn the invoice into a single transaction.

        Returns an empty list (after logging a warning) if the file name is
        not an invoice name or no amount due is found in the PDF. Otherwise
        returns one transaction dated by the file name, with
        ``hetzner_<invoice number>`` as its last set argument before the
        postings.

        The payment postings depend on which notice is found in the PDF:
        credit balance only, credit card only, or a mix of both. If no
        notice is found, or the mixed payments do not add up to the amount
        due, the transaction is flagged with ``FLAG_WARNING`` and the expense
        posting carries the amount due explicitly; otherwise the expense
        posting uses ``MISSING_AMOUNT``.
        '''
        name_match = INVOICE_REGEX.match(file.name)
        if name_match is None:
            self._log.warning('Not an invoice: %s', file.name)
            return []
        invoice_number = name_match.group('number')
        date = self.file_date(file)
        # Materialise the pages because they are scanned once per pattern.
        pages = list(cast(Iterable[LTPage], extract_pages(file.name)))
        amount_match = _extract_match(pages, AMOUNT_REGEX)
        if amount_match is None:
            self._log.warning('No amount found in %s', file.name)
            return []
        number = D(amount_match.group('amount'))
        assert number is not None
        amount = Amount(number, 'EUR')
        postings = []
        flag = FLAG_OKAY
        if _extract_match(pages, BALANCE_REGEX) is not None:
            postings.append(self._posting(self._credit_balance, -amount))
        elif _extract_match(pages, CARD_REGEX) is not None:
            postings.append(self._posting(self._liability, -amount))
        elif mixed := _extract_match(pages, MIXED_REGEX):
            balance_number = D(mixed.group('balance'))
            assert balance_number is not None
            balance_amount = Amount(balance_number, 'EUR')
            postings.append(self._posting(self._credit_balance, -balance_amount))
            card_number = D(mixed.group('card'))
            assert card_number is not None
            card_amount = Amount(card_number, 'EUR')
            postings.append(self._posting(self._liability, -card_amount))
            if am.add(balance_amount, card_amount) != amount:
                self._log.warning('Payments do not cover total amount in %s', file.name)
                flag = FLAG_WARNING
        else:
            self._log.warning('Unknown payment method in %s', file.name)
            flag = FLAG_WARNING
        # Keep the explicit total on flagged transactions so the discrepancy
        # stays visible; otherwise leave the expense amount to be filled in.
        expense_units = MISSING_AMOUNT if flag == FLAG_OKAY else amount
        postings.append(self._posting(self._expense, expense_units))
        return [
            data.Transaction(data.new_metadata(file.name, 0), date, flag, 'Hetzner', 'Invoice',
                             set(), {f'hetzner_{invoice_number}'}, postings)
        ]