Newer
Older
# -*- coding: utf-8 -*-
#
# Copyright 2015-2016, Roland Mas <lolando@debian.org>
#
# Licensed under the GNU Lesser General Public License, version 3
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#
# This is imm-o-matic, a script that filters the results of real estate sites
# according to keywords, then generates PDF files containing the description
# and photos of the relevant items found
#
# Requires a couple of Python modules plus wkhtmltopdf
# Note that unless wkhtmltopdf is built against a patched webkit, it won't
# work headless so you need an X session running
#
# Configuration file is YAML:
# [BEGIN CONFIG FILE]
# ---
# set1:
# queries:
# - city1
# - city2
# keywords:
# - garden
# - terrace
# maxprice: 1000000
# minprice: 10000
# backends:
# - logicimmo
# - pap
# destdir: /home/roland/immo/set1/files/
# workdir: /home/roland/immo/set1/workdir/
# set2:
# queries:
# - city3
# - city4
# keywords:
# - balcony
# maxprice: 1000000
# minprice: 10000
# backends:
# - explorimmo
# destdir: /home/roland/immo/set2/files/
# workdir: /home/roland/immo/set2/workdir/
# [END CONFIG FILE]
#
# You must create the flatboob saved queries yourself (city1, city2 and so on
# in the config sample above) before running imm-o-matic
import csv
import sqlite3
import os
import json
import re
import subprocess
import hashlib
import glob
import tempfile
import shutil
import argparse
import yaml
# Names of the per-estate attributes; they are used both as keys into the
# flatboob JSON records and as the column names of the `details` table.
fields = [
    'id',
    'title',
    'area',
    'cost',
    'currency',
    'date',
    'location',
    'station',
    'text',
    'phone',
    'photos',
    'details',
    'url',
]
# Init database connection
def connectdb():
    """Open the SQLite database at ``dbf`` and make sure its tables exist.

    Reads the module-level names ``dbf`` (database file path) and ``fields``
    (column names of the ``details`` table).

    Three tables are created when missing:
      * ``ids``        - one row per known estate id, with ``details`` and
                         ``fetched`` progress flags (both default to 0);
      * ``details``    - one TEXT column per entry of ``fields``;
      * ``searchable`` - normalised text used for keyword matching.

    Returns:
        The open ``sqlite3.Connection``.

    Raises:
        sqlite3.Error: if the schema cannot be created; the connection is
            closed before the error propagates.
    """
    db = sqlite3.connect(dbf)
    cols = ", ".join(name + " TEXT" for name in fields)
    # IF NOT EXISTS makes re-running on an existing database a no-op, so a
    # failure here is a genuine error rather than "table already exists".
    schema = (
        'CREATE TABLE IF NOT EXISTS ids(id TEXT UNIQUE NOT NULL, details INTEGER DEFAULT 0 NOT NULL, fetched INTEGER DEFAULT 0 NOT NULL)',
        'CREATE TABLE IF NOT EXISTS details(' + cols + ')',
        'CREATE TABLE IF NOT EXISTS searchable(id TEXT UNIQUE NOT NULL, searchable TEXT NOT NULL)',
    )
    # All statements go through a cursor of *this* connection; relying on a
    # cursor from the enclosing scope would target another database (or
    # fail with NameError on the first call).
    setup = db.cursor()
    try:
        for statement in schema:
            setup.execute(statement)
        db.commit()
    except sqlite3.Error:
        db.rollback()
        db.close()
        raise
    return db
# Fetch ids for all available estates
def getids():
    """Run every saved query on every backend and record unseen estate ids.

    Reads the module-level names ``db`` (open SQLite connection), ``backs``
    (backend names), ``reqs`` (flatboob saved-query names) and ``workdir``
    (directory prefix, concatenated as-is with the file names).

    For each (backend, query) pair the previous ``<query>-<backend>.csv`` in
    ``workdir`` is removed and flatboob is asked to write a fresh one.  Every
    ``*.csv`` file found in ``workdir`` is then parsed (``;`` separated, first
    row is the header) and each id not yet in the ``ids`` table is inserted.
    """
    cursor = db.cursor()
    for b in backs:
        for r in reqs:
            wf = workdir + r + '-' + b + '.csv'
            try:
                os.unlink(wf)
            except FileNotFoundError:
                pass
            print("Querying %s for %s" % (b, r))
            subprocess.run(['/usr/bin/flatboob', 'load', r, '-n', '1000', '-f', 'csv', '-b', b, '-O', wf])

    cursor.execute('SELECT id FROM ids')
    ids = {row[0] for row in cursor}

    for csf in glob.iglob(workdir + '*.csv'):
        # newline='' is what the csv module expects so that quoted fields
        # containing line breaks are parsed correctly.
        with open(csf, newline='') as handle:
            reader = csv.reader(handle, delimiter=";")
            indices = next(reader, None)
            # An empty file (no header) or one without an id column has
            # nothing to contribute.
            if indices is None or 'id' not in indices:
                continue
            for csvrow in reader:
                row = dict(zip(indices, csvrow))
                estate_id = row.get('id')
                if not estate_id or estate_id in ids:
                    continue
                print("Inserting id %s" % (estate_id,))
                cursor.execute('INSERT INTO ids (id) VALUES (?)',
                               (estate_id,))
                ids.add(estate_id)
    db.commit()
# Fetch and store details for estates
def getdetails():
    """Download the details of new estates and rebuild the search index.

    Reads the module-level names ``db`` (open SQLite connection) and
    ``fields`` (columns of the ``details`` table).

    Step 1: for every id whose ``details`` flag is 0, ``flatboob info -f json``
    is run; the first record of its JSON output is stored in ``details`` and
    the flag is set to 1.  Ids for which flatboob yields no usable record are
    left untouched so that they are retried on the next run.

    Step 2: for every id whose ``details`` flag is 1, the title and text are
    joined, lower-cased and whitespace-collapsed, and stored in
    ``searchable`` (replacing any previous entry).
    """
    cursor = db.cursor()
    cursor.execute('SELECT id FROM ids WHERE details=0 ORDER BY id')
    pending = [row[0] for row in cursor]

    def serializefield(x):
        # SQLite cannot bind lists or dicts: keep them as JSON text.
        if isinstance(x, (dict, list)):
            return json.dumps(x)
        return x

    q = 'INSERT INTO DETAILS (' + ','.join(fields) + ') VALUES (' + ','.join('?' for _ in fields) + ')'
    for i in pending:
        print("Fetching details for %s" % (i,))
        try:
            # Argument list, no shell: ids come from scraped web sites and
            # must never be interpolated into a command line.
            proc = subprocess.run(['flatboob', 'info', '-f', 'json', i],
                                  stdout=subprocess.PIPE,
                                  universal_newlines=True)
            d = json.loads(proc.stdout)
        # NOTE(review): the handler line of the original was not visible in
        # the reviewed source; catching launch and decoding failures here is
        # a reconstruction - confirm against the upstream file.
        except (OSError, ValueError):
            continue
        if not isinstance(d, list) or not d or not isinstance(d[0], dict):
            continue
        record = d[0]
        # Absent attributes are stored as NULL instead of aborting the run.
        p = [serializefield(record.get(name)) for name in fields]
        # After a --regen the id may already have a row; drop it so that the
        # fresh data is the one that gets indexed.
        cursor.execute('DELETE FROM details WHERE id=?', (i,))
        cursor.execute(q, p)
        cursor.execute('UPDATE ids SET details=1 WHERE id=?', (i,))
        db.commit()

    cursor.execute('SELECT id FROM ids WHERE details=1 ORDER BY id')
    known = [row[0] for row in cursor]

    def gensearchable(x):
        cursor.execute('SELECT title,text FROM details WHERE id=?', (x,))
        row = cursor.fetchone()
        if row is None:
            # Flag set but no stored record: nothing to index.
            return
        searchable = (row[0] or '') + ' ' + (row[1] or '')
        searchable = searchable.lower()
        searchable = re.sub('\\s+', ' ', searchable)
        cursor.execute('DELETE FROM searchable WHERE id=?', (x,))
        cursor.execute('INSERT INTO searchable (id,searchable) VALUES (?,?)', (x, searchable))

    for i in known:
        gensearchable(i)
    db.commit()
def _parse_cost(raw):
    """Return ``raw`` as an int, or None when it is not a usable number."""
    try:
        return int(raw)
    except (TypeError, ValueError):
        pass
    try:
        # Accept decimal renderings such as "250000.0".
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return None


# Filter estates according to keywords and generate PDFs
def search():
    """Generate a PDF for every not-yet-fetched estate matching a keyword.

    Reads the module-level names ``db`` (open SQLite connection), ``kws``
    (keywords, used as regular-expression fragments matched on word
    boundaries, case-insensitively), ``minprice`` / ``maxprice`` (inclusive
    price bounds) and ``destdir`` (directory prefix, concatenated as-is with
    the PDF file name).

    For each selected estate an HTML page with its description (keywords
    highlighted) and its photos (downloaded with wget) is built in a
    temporary directory and converted with wkhtmltopdf into
    ``"<location> <cost>€ <id>.pdf"``; the estate's ``fetched`` flag is then
    set to 1.  Estates without stored details or without a usable price are
    skipped and stay unfetched.
    """
    import html  # local import: only this function needs it

    cursor = db.cursor()
    cursor.execute('SELECT id FROM ids WHERE fetched=1 ORDER BY id')
    fetched = {row[0] for row in cursor}
    cursor.execute('SELECT id,searchable FROM searchable')
    index = {row[0]: row[1] for row in cursor if row[0] not in fetched}

    # Compile once instead of once per estate.
    selectors = [re.compile('\\b' + k + '\\b', flags=re.I) for k in kws]
    highlighters = [re.compile('\\b(' + k + ')\\b', flags=re.I) for k in kws]

    for i, s in index.items():
        if not any(pattern.search(s) for pattern in selectors):
            continue
        cursor.execute('SELECT title,location,cost,area,text,phone,photos,url FROM details WHERE id=?', (i,))
        row = cursor.fetchone()
        if row is None:
            continue
        d = {
            'title': row[0],
            'location': row[1],
            'cost': row[2],
            'area': row[3],
            'text': row[4],
            'phone': row[5],
            'photos': row[6],
            'url': row[7],
            'id': i,
            'site': re.sub('.*@', '', i),
        }
        # Reduce the location to a bare place name: no digits (postcode),
        # nothing from the first parenthesis on, single spaces.
        location = d['location'] or ''
        location = re.sub(r'\d', '', location)
        location = re.sub(r'\s+', ' ', location)
        location = re.sub(r'\(.*', '', location)
        d['location'] = location.strip()

        cost = _parse_cost(d['cost'])
        if cost is None:
            print("Skipping %s: unusable price %r" % (i, d['cost']))
            continue
        if cost < minprice or cost > maxprice:
            continue
        print("Selecting %s, downloading" % (i,))

        # Everything below comes from scraped web pages and ends up in HTML
        # rendered by wkhtmltopdf: escape it.  The description is escaped
        # first, then the highlighting markup is added on top.
        page = {key: html.escape(str(value)) for key, value in d.items()}
        text = html.escape(d['text'] or '', quote=False)
        for pattern in highlighters:
            text = pattern.sub('<span style="background:#FFAAAA">\\1</span>', text)
        page['text'] = text

        try:
            photos = json.loads(row[6]) if row[6] else []
        except ValueError:
            photos = []
        if not isinstance(photos, list):
            photos = []

        dirpath = tempfile.mkdtemp()
        # Kept off on purpose: with check=False a non-zero exit status of
        # wget/wkhtmltopdf does not raise CalledProcessError.
        do_checks = False
        try:
            # The page declares utf-8, so write it as utf-8 whatever the
            # locale is.
            with open(os.path.join(dirpath, 'tmp.html'), 'w', encoding='utf-8') as f:
                f.write("""<html><head><meta charset='utf-8' /><title>%(title)s</title></head><body>
<ul>
<li>Titre : %(title)s</li>
<li>Ville : %(location)s</li>
<li>Prix : %(cost)s €</li>
<li>Surface : %(area)s</li>
<li><a href="%(url)s">%(url)s</a> (site : %(site)s)</li>
<li>Téléphone : %(phone)s</li>
<li>
Descriptif :
%(text)s
</li>
</ul><br>
""" % page)
                for p in photos:
                    u = p.get('url') if isinstance(p, dict) else None
                    if not u:
                        continue
                    fn = 'photo-' + hashlib.md5(u.encode('utf-8')).hexdigest()
                    # cwd= replaces os.chdir(); '--' keeps a hostile URL
                    # from being parsed as a wget option.
                    subprocess.run(['/usr/bin/wget', '-q', '-O', fn, '-c', '--', u],
                                   check=do_checks, cwd=dirpath)
                    f.write("""<img src="%s" />
""" % (fn,))
                f.write("""</body></html>""")
            dname = ("%(location)s %(cost)s€ %(id)s.pdf" % d).strip()
            # A path separator inside the name would point into a
            # sub-directory that does not exist.
            dname = dname.replace('/', '_')
            # Absolute path, because wkhtmltopdf runs inside the temp dir.
            target = os.path.abspath(destdir + dname)
            subprocess.run(['/usr/bin/wkhtmltopdf', '-q', 'tmp.html', target],
                           check=do_checks, cwd=dirpath)
            cursor.execute('UPDATE ids SET fetched=1 WHERE id=?', (i,))
            db.commit()
        except subprocess.CalledProcessError:
            print("Error")
        finally:
            # Always clean up, including on unexpected errors.
            shutil.rmtree(dirpath, ignore_errors=True)
    db.commit()
# Script entry point.  The per-set settings below are deliberately plain
# module-level names: connectdb(), getids(), getdetails() and search() read
# them as globals.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", help="What to do", default="all",
                        choices=['all', 'getids', 'getdetails', 'search'],
                        required=False)
    parser.add_argument("--regen", help="Regen PDF", required=False)
    # expanduser() also works when $HOME is unset, where
    # os.getenv('HOME') + '...' would raise TypeError.
    parser.add_argument("--configfile", help="Config file",
                        default=os.path.expanduser('~/.config/imm-o-matic.yaml'),
                        required=False)
    args = parser.parse_args()
    with open(args.configfile) as f:
        configs = yaml.load(f, Loader=yaml.SafeLoader)

    # An empty configuration file loads as None.
    for setname in (configs or {}):
        config = configs[setname]
        destdir = config['destdir']
        workdir = config['workdir']
        kws = config['keywords']
        reqs = config['queries']
        minprice = config['minprice']
        maxprice = config['maxprice']
        backs = config['backends']
        # workdir is used as a plain prefix, so it must end with a separator.
        dbf = workdir + 'imm-o-matic.sqlite'
        db = connectdb()
        cursor = db.cursor()
        try:
            if args.regen:
                # Accept either a bare id or the path of a generated PDF.
                regen_name = re.sub('^.*/', '', args.regen)
                regen_name = re.sub(r'\.pdf$', '', regen_name)
                # Generated PDFs are named "<location> <cost>€ <id>.pdf", so
                # the id is the last space-separated token of the name.
                for candidate in {regen_name, regen_name.split(' ')[-1]}:
                    cursor.execute('UPDATE ids SET details=0,fetched=0 WHERE id=?', (candidate,))
                db.commit()
            if args.action == 'getids':
                getids()
            elif args.action == 'getdetails':
                getdetails()
            elif args.action == 'search':
                search()
            elif args.action == 'all':
                getids()
                getdetails()
                search()
        finally:
            db.close()