diff --git a/setup.cfg b/setup.cfg
index bab69fd364a5877e6ca7bf47539fb7ac35577c0c..f45ba0da76e65912551c250f283f5723b2e98a76 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -7,8 +7,10 @@ tests =
     weboob.tools.capabilities.bank.transactions,
     weboob.tools.capabilities.paste,
     weboob.tools.application.formatters.json,
     weboob.tools.application.formatters.table,
+    weboob.tools.date,
     weboob.tools.path,
+    weboob.tools.tokenizer,
     weboob.browser.browsers,
     weboob.browser.pages,
     weboob.browser.filters.standard
diff --git a/weboob/tools/capabilities/bank/transactions.py b/weboob/tools/capabilities/bank/transactions.py
index 502c06e4f647ad41b56d41ad73ff77811d96d413..cd6bdd7fa956615182d25efc37a51d4c694691e3 100644
--- a/weboob/tools/capabilities/bank/transactions.py
+++ b/weboob/tools/capabilities/bank/transactions.py
@@ -335,6 +335,14 @@ def clean_amount(klass, text):
             text = text.replace(',', ' ').replace('.', ',')
         return FrenchTransaction.clean_amount(text)
 
+    @classmethod
+    def decimal_amount(klass, text):
+        """
+        Convert a string containing an amount to Decimal.
+        """
+        amnt = AmericanTransaction.clean_amount(text)
+        return Decimal(amnt) if amnt else Decimal('0')
+
 
 def test():
     clean_amount = AmericanTransaction.clean_amount
@@ -344,3 +352,7 @@ def test():
     assert clean_amount('$42.12 USD') == '42.12'
     assert clean_amount('$12.442,12 USD') == '12442.12'
     assert clean_amount('$12,442.12 USD') == '12442.12'
+
+    decimal_amount = AmericanTransaction.decimal_amount
+    assert decimal_amount('$12,442.12 USD') == Decimal('12442.12')
+    assert decimal_amount('') == Decimal('0')
diff --git a/weboob/tools/date.py b/weboob/tools/date.py
index ab29571cfaa3c9ce5393db8821b6c861489eace7..b281edd67e7769e2682c38f395952cb5d74434d5 100644
--- a/weboob/tools/date.py
+++ b/weboob/tools/date.py
@@ -27,7 +27,7 @@
     raise ImportError('Please install python-dateutil')
 
 
-__all__ = ['local2utc', 'utc2local', 'LinearDateGuesser', 'date', 'datetime', 'new_date', 'new_datetime']
+__all__ = ['local2utc', 'utc2local', 'LinearDateGuesser', 'date', 'datetime', 'new_date', 'new_datetime', 'closest_date']
 
 
 def local2utc(dateobj):
@@ -315,3 +315,47 @@ def parse_date(string):
 
     elif string.upper() == "TODAY":
         return date.today()
+
+
+def closest_date(date, date_from, date_to):
+    """
+    Adjusts year so that the date is closest to the given range.
+    Transactions dates in a statement usually contain only day and month.
+    Statement dates range have a year though.
+    Merge them all together to get a full transaction date.
+    """
+    # If the date is within given range, we're done.
+    if date_from <= date <= date_to:
+        return date
+
+    dates = [real_datetime(year, date.month, date.day)
+             for year in xrange(date_from.year, date_to.year+1)]
+
+    # Ideally, pick the date within given range.
+    for d in dates:
+        if date_from <= d <= date_to:
+            return d
+
+    # Otherwise, pick the candidate closest to the start of the range.
+    return min(dates, key=lambda d: abs(d-date_from))
+
+
+def test():
+    dt = real_datetime
+    range1 = [dt(2012,12,20), dt(2013,1,10)]
+
+    assert closest_date(dt(2012,12,15), *range1) == dt(2012,12,15)
+    assert closest_date(dt(2000,12,15), *range1) == dt(2012,12,15)
+    assert closest_date(dt(2020,12,15), *range1) == dt(2012,12,15)
+
+    assert closest_date(dt(2013,1,15), *range1) == dt(2013,1,15)
+    assert closest_date(dt(2000,1,15), *range1) == dt(2013,1,15)
+    assert closest_date(dt(2020,1,15), *range1) == dt(2013,1,15)
+
+    assert closest_date(dt(2013,1,1), *range1) == dt(2013,1,1)
+    assert closest_date(dt(2000,1,1), *range1) == dt(2013,1,1)
+    assert closest_date(dt(2020,1,1), *range1) == dt(2013,1,1)
+
+    range2 = [dt(2012,12,20), dt(2014,1,10)]
+    assert closest_date(dt(2012,12,15), *range2) == dt(2013,12,15)
+    assert closest_date(dt(2014,1,15), *range2) == dt(2013,1,15)
diff --git a/weboob/tools/pdf.py b/weboob/tools/pdf.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fbebf525d349a23109d16056d4c99f537b8972f
--- /dev/null
+++ b/weboob/tools/pdf.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2014 Oleg Plakhotniuk
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import subprocess
+from tempfile import mkstemp
+
+
+__all__ = ['decompress_pdf']
+
+
+def decompress_pdf(inpdf):
+    """
+    Takes PDF file contents as a string and returns decompressed version
+    of the file contents, suitable for text parsing.
+
+    External dependencies:
+    MuPDF (http://www.mupdf.com).
+    """
+
+    inh, inname = mkstemp(suffix='.pdf')
+    outh, outname = mkstemp(suffix='.pdf')
+    os.write(inh, inpdf)
+    os.close(inh)
+    os.close(outh)
+
+    subprocess.call(['mutool', 'clean', '-d', inname, outname])
+
+    with open(outname, 'rb') as f:
+        outpdf = f.read()
+    os.remove(inname)
+    os.remove(outname)
+    return outpdf
diff --git a/weboob/tools/tokenizer.py b/weboob/tools/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c477d24391da7d899135383f998e515637f37e79
--- /dev/null
+++ b/weboob/tools/tokenizer.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2014 Oleg Plakhotniuk
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+import re
+
+
+__all__ = ['ReTokenizer']
+
+
+class ReTokenizer(object):
+    """
+    Simple regex-based tokenizer (AKA lexer or lexical analyser).
+    Useful for PDF statements parsing.
+
+    1. There's a lexing table consisting of type-regex tuples.
+    2. Lexer splits text into chunks using the separator character.
+    3. Text chunk is sequentially matched against regexes and first
+       successful match defines the type of the token.
+
+    Check out test() function below for examples.
+    """
+
+    def __init__(self, text, sep, lex):
+        self._lex = lex
+        self._tok = [ReToken(lex, chunk) for chunk in text.split(sep)]
+
+    def tok(self, index):
+        if 0 <= index < len(self._tok):
+            return self._tok[index]
+        else:
+            return ReToken(self._lex, eof=True)
+
+    def simple_read(self, token_type, pos, transform=lambda v: v):
+        t = self.tok(pos)
+        is_type = getattr(t, 'is_%s' % token_type)()
+        return (pos+1, transform(t.value())) if is_type else (pos, None)
+
+
+class ReToken(object):
+    def __init__(self, lex, chunk=None, eof=False):
+        self._lex = lex
+        self._eof = eof
+        self._value = None
+        self._type = None
+        if chunk is not None:
+            for type_, regex in self._lex:
+                m = re.match(regex, chunk, flags=re.UNICODE)
+                if m:
+                    self._type = type_
+                    if len(m.groups()) == 1:
+                        self._value = m.groups()[0]
+                    elif m.groups():
+                        self._value = m.groups()
+                    else:
+                        self._value = m.group(0)
+                    break
+
+    def is_eof(self):
+        return self._eof
+
+    def value(self):
+        return self._value
+
+    def __getattr__(self, name):
+        if name.startswith('is_'):
+            return lambda: self._type == name[3:]
+        raise AttributeError(name)
+
+
+def test():
+    t = ReTokenizer('foo bar baz', ' ', [('f', r'^f'), ('b', r'^b')])
+
+    assert t.tok(0).is_f()
+    assert t.tok(1).is_b()
+    assert t.tok(2).is_b()
+
+    assert t.tok(-1).is_eof()
+    assert t.tok(3).is_eof()
+
+    assert not t.tok(-1).is_f()
+    assert not t.tok(0).is_b()
+    assert not t.tok(0).is_eof()
+
+    t = ReTokenizer('nogroup onegroup multigroup', ' ', [
+        ('ng', r'^n.*$'),
+        ('og', r'^one(g.*)$'),
+        ('mg', r'^(m.*)(g.*)$')])
+
+    assert t.tok(-1).value() is None
+    assert t.tok(0).value() == 'nogroup'
+    assert t.tok(1).value() == 'group'
+    assert t.tok(2).value() == ('multi', 'group')