diff --git a/setup.cfg b/setup.cfg
index bab69fd364a5877e6ca7bf47539fb7ac35577c0c..f45ba0da76e65912551c250f283f5723b2e98a76 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -7,7 +7,9 @@ tests = weboob.tools.capabilities.bank.transactions,
weboob.tools.capabilities.paste,
weboob.tools.application.formatters.json,
weboob.tools.application.formatters.table,
+ weboob.tools.date,
weboob.tools.path,
+ weboob.tools.tokenizer,
weboob.browser.browsers,
weboob.browser.pages,
weboob.browser.filters.standard
diff --git a/weboob/tools/capabilities/bank/transactions.py b/weboob/tools/capabilities/bank/transactions.py
index 502c06e4f647ad41b56d41ad73ff77811d96d413..cd6bdd7fa956615182d25efc37a51d4c694691e3 100644
--- a/weboob/tools/capabilities/bank/transactions.py
+++ b/weboob/tools/capabilities/bank/transactions.py
@@ -335,6 +335,14 @@ def clean_amount(klass, text):
text = text.replace(',', ' ').replace('.', ',')
return FrenchTransaction.clean_amount(text)
+ @classmethod
+ def decimal_amount(klass, text):
+ """
+ Convert a string containing an amount to Decimal.
+ """
+ amnt = AmericanTransaction.clean_amount(text)
+ return Decimal(amnt) if amnt else Decimal('0')
+
def test():
clean_amount = AmericanTransaction.clean_amount
@@ -344,3 +352,7 @@ def test():
assert clean_amount('$42.12 USD') == '42.12'
assert clean_amount('$12.442,12 USD') == '12442.12'
assert clean_amount('$12,442.12 USD') == '12442.12'
+
+ decimal_amount = AmericanTransaction.decimal_amount
+ assert decimal_amount('$12,442.12 USD') == Decimal('12442.12')
+ assert decimal_amount('') == Decimal('0')
diff --git a/weboob/tools/date.py b/weboob/tools/date.py
index ab29571cfaa3c9ce5393db8821b6c861489eace7..b281edd67e7769e2682c38f395952cb5d74434d5 100644
--- a/weboob/tools/date.py
+++ b/weboob/tools/date.py
@@ -27,7 +27,7 @@
raise ImportError('Please install python-dateutil')
-__all__ = ['local2utc', 'utc2local', 'LinearDateGuesser', 'date', 'datetime', 'new_date', 'new_datetime']
+__all__ = ['local2utc', 'utc2local', 'LinearDateGuesser', 'date', 'datetime', 'new_date', 'new_datetime', 'closest_date']
def local2utc(dateobj):
@@ -315,3 +315,47 @@ def parse_date(string):
elif string.upper() == "TODAY":
return date.today()
+
+
+def closest_date(date, date_from, date_to):
+ """
+ Adjusts year so that the date is closest to the given range.
+ Transaction dates in a statement usually contain only day and month.
+ A statement's date range does have a year, though.
+ Merge them all together to get a full transaction date.
+ """
+ # If the date is within given range, we're done.
+ if date_from <= date <= date_to:
+ return date
+
+ dates = [real_datetime(year, date.month, date.day)
+ for year in xrange(date_from.year, date_to.year+1)]
+
+ # Ideally, pick the date within given range.
+ for d in dates:
+ if date_from <= d <= date_to:
+ return d
+
+ # Otherwise, pick the candidate date closest to the start of the range.
+ return min(dates, key=lambda d: abs(d-date_from))
+
+
+def test():
+ dt = real_datetime
+ range1 = [dt(2012,12,20), dt(2013,1,10)]
+
+ assert closest_date(dt(2012,12,15), *range1) == dt(2012,12,15)
+ assert closest_date(dt(2000,12,15), *range1) == dt(2012,12,15)
+ assert closest_date(dt(2020,12,15), *range1) == dt(2012,12,15)
+
+ assert closest_date(dt(2013,1,15), *range1) == dt(2013,1,15)
+ assert closest_date(dt(2000,1,15), *range1) == dt(2013,1,15)
+ assert closest_date(dt(2020,1,15), *range1) == dt(2013,1,15)
+
+ assert closest_date(dt(2013,1,1), *range1) == dt(2013,1,1)
+ assert closest_date(dt(2000,1,1), *range1) == dt(2013,1,1)
+ assert closest_date(dt(2020,1,1), *range1) == dt(2013,1,1)
+
+ range2 = [dt(2012,12,20), dt(2014,1,10)]
+ assert closest_date(dt(2012,12,15), *range2) == dt(2013,12,15)
+ assert closest_date(dt(2014,1,15), *range2) == dt(2013,1,15)
diff --git a/weboob/tools/pdf.py b/weboob/tools/pdf.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fbebf525d349a23109d16056d4c99f537b8972f
--- /dev/null
+++ b/weboob/tools/pdf.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2014 Oleg Plakhotniuk
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import subprocess
+from tempfile import mkstemp
+
+
+__all__ = ['decompress_pdf']
+
+
+def decompress_pdf(inpdf):
+ """
+ Takes PDF file contents as a string and returns a decompressed version
+ of the file contents, suitable for text parsing.
+
+ External dependencies:
+ MuPDF (http://www.mupdf.com).
+ """
+
+ inh, inname = mkstemp(suffix='.pdf')
+ outh, outname = mkstemp(suffix='.pdf')
+ os.write(inh, inpdf)
+ os.close(inh)
+ os.close(outh)
+
+ subprocess.call(['mutool', 'clean', '-d', inname, outname])
+
+ with open(outname) as f:
+ outpdf = f.read()
+ os.remove(inname)
+ os.remove(outname)
+ return outpdf
diff --git a/weboob/tools/tokenizer.py b/weboob/tools/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c477d24391da7d899135383f998e515637f37e79
--- /dev/null
+++ b/weboob/tools/tokenizer.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2014 Oleg Plakhotniuk
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+import re
+
+
+__all__ = ['ReTokenizer']
+
+
+class ReTokenizer(object):
+ """
+ Simple regex-based tokenizer (AKA lexer or lexical analyser).
+ Useful for PDF statements parsing.
+
+ 1. There's a lexing table consisting of type-regex tuples.
+ 2. Lexer splits text into chunks using the separator character.
+ 3. Each text chunk is matched sequentially against the regexes; the
+ first successful match defines the type of the token.
+
+ Check out the test() function below for examples.
+ """
+
+ def __init__(self, text, sep, lex):
+ self._lex = lex
+ self._tok = [ReToken(lex, chunk) for chunk in text.split(sep)]
+
+ def tok(self, index):
+ if 0 <= index < len(self._tok):
+ return self._tok[index]
+ else:
+ return ReToken(self._lex, eof=True)
+
+ def simple_read(self, token_type, pos, transform=lambda v: v):
+ t = self.tok(pos)
+ is_type = getattr(t, 'is_%s' % token_type)()
+ return (pos+1, transform(t.value())) if is_type else (pos, None)
+
+
+class ReToken(object):
+ def __init__(self, lex, chunk=None, eof=False):
+ self._lex = lex
+ self._eof = eof
+ self._value = None
+ self._type = None
+ if chunk is not None:
+ for type_, regex in self._lex:
+ m = re.match(regex, chunk, flags=re.UNICODE)
+ if m:
+ self._type = type_
+ if len(m.groups()) == 1:
+ self._value = m.groups()[0]
+ elif m.groups():
+ self._value = m.groups()
+ else:
+ self._value = m.group(0)
+ break
+
+ def is_eof(self):
+ return self._eof
+
+ def value(self):
+ return self._value
+
+ def __getattr__(self, name):
+ if name.startswith('is_'):
+ return lambda: self._type == name[3:]
+ raise AttributeError()
+
+
+def test():
+ t = ReTokenizer('foo bar baz', ' ', [('f', r'^f'), ('b', r'^b')])
+
+ assert t.tok(0).is_f()
+ assert t.tok(1).is_b()
+ assert t.tok(2).is_b()
+
+ assert t.tok(-1).is_eof()
+ assert t.tok(3).is_eof()
+
+ assert not t.tok(-1).is_f()
+ assert not t.tok(0).is_b()
+ assert not t.tok(0).is_eof()
+
+ t = ReTokenizer('nogroup onegroup multigroup', ' ', [
+ ('ng', r'^n.*$'),
+ ('og', r'^one(g.*)$'),
+ ('mg', r'^(m.*)(g.*)$')])
+
+ assert t.tok(-1).value() == None
+ assert t.tok(0).value() == 'nogroup'
+ assert t.tok(1).value() == 'group'
+ assert t.tok(2).value() == ('multi', 'group')