pax_global_header 0000666 0000000 0000000 00000000064 12356015330 0014510 g ustar 00root root 0000000 0000000 52 comment=e025fb0b2040e76d68512fca33d3483aa63d925d
woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/ 0000775 0000000 0000000 00000000000 12356015330 0023233 5 ustar 00root root 0000000 0000000 woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/ 0000775 0000000 0000000 00000000000 12356015330 0024703 5 ustar 00root root 0000000 0000000 woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/presseurop/ 0000775 0000000 0000000 00000000000 12356015330 0027112 5 ustar 00root root 0000000 0000000 woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/presseurop/__init__.py 0000664 0000000 0000000 00000001532 12356015330 0031224 0 ustar 00root root 0000000 0000000 "NewspaperPresseuropBackend init"
# -*- coding: utf-8 -*-
# Copyright(C) 2012 Florent Fourcot
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from .backend import NewspaperPresseuropBackend
__all__ = ['NewspaperPresseuropBackend']
woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/presseurop/backend.py 0000664 0000000 0000000 00000006034 12356015330 0031056 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
# Copyright(C) 2012 Florent Fourcot
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
"backend for http://www.presseurop.eu"
from weboob.capabilities.messages import CapMessages, Thread
from weboob.tools.capabilities.messages.GenericBackend import GenericNewspaperBackend
from weboob.tools.backend import BackendConfig
from weboob.tools.value import Value
from .browser import NewspaperPresseuropBrowser
from .tools import rssid, url2id
from weboob.tools.newsfeed import Newsfeed
class NewspaperPresseuropBackend(GenericNewspaperBackend, CapMessages):
    """Newspaper backend for the presseurop (voxeurop) website."""
    MAINTAINER = u'Florent Fourcot'
    EMAIL = 'weboob@flo.fourcot.fr'
    VERSION = '0.j'
    LICENSE = 'AGPLv3+'
    STORAGE = {'seen': {}}
    NAME = 'presseurop'
    DESCRIPTION = u'Presseurop website'
    BROWSER = NewspaperPresseuropBrowser
    RSSID = staticmethod(rssid)
    URL2ID = staticmethod(url2id)
    RSSSIZE = 300
    CONFIG = BackendConfig(Value('lang', label='Lang of articles',
                           choices={'fr': 'fr', 'de': 'de', 'en': 'en',
                                    'cs': 'cs', 'es': 'es', 'it': 'it', 'nl': 'nl',
                                    'pl': 'pl', 'pt': 'pt', 'ro': 'ro'},
                           default='fr'))

    def __init__(self, *args, **kwargs):
        GenericNewspaperBackend.__init__(self, *args, **kwargs)
        # The feed URL depends on the configured article language.
        self.RSS_FEED = 'http://www.voxeurop.eu/%s/rss.xml' % self.config['lang'].get()

    def _thread_from_entry(self, entry):
        "Build a Thread straight from a feed entry."
        thread = Thread(entry.link)
        thread.title = entry.title
        thread.date = entry.datetime
        return thread

    def iter_threads(self):
        """Yield one Thread per feed entry.

        News briefs belonging to the same day are merged into a single
        daily thread; everything else becomes its own thread.
        """
        seen_days = []
        for entry in Newsfeed(self.RSS_FEED, self.RSSID).iter_entries():
            if "/news-brief/" not in entry.link:
                yield self._thread_from_entry(entry)
                continue
            day = self.browser.get_daily_date(entry.link)
            if day and day not in seen_days:
                seen_days.append(day)
                daily_id, title, daily_date = self.browser.get_daily_infos(day)
                thread = Thread(daily_id + "#" + url2id(entry.link))
                thread.title = title
                thread.date = daily_date
                yield thread
            elif day is None:
                # Brief without a daily page: expose it as a standalone thread.
                yield self._thread_from_entry(entry)
            # A day already seen is skipped: its briefs share one thread.
woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/presseurop/browser.py 0000664 0000000 0000000 00000004207 12356015330 0031152 0 ustar 00root root 0000000 0000000 "browser for presseurop website"
# -*- coding: utf-8 -*-
# Copyright(C) 2012 Florent Fourcot
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from datetime import date, datetime, time
from .pages.article import PresseuropPage, CartoonPage, DailySinglePage,\
DailyTitlesPage
from weboob.tools.browser import BaseBrowser
from weboob.tools.ordereddict import OrderedDict
class NewspaperPresseuropBrowser(BaseBrowser):
    """Browser for the presseurop (voxeurop) website."""

    # Order matters: most specific patterns first, catch-all article page last.
    PAGES = OrderedDict((
        ("http://www.voxeurop.eu/.*/news-brief/.*", DailySinglePage),
        ("http://www.voxeurop.eu/.*/today/.*", DailyTitlesPage),
        ("http://www.voxeurop.eu/.*/cartoon/.*", CartoonPage),
        ("http://www.voxeurop.eu/.*", PresseuropPage),
    ))

    def is_logged(self):
        # The website is browsed anonymously.
        return False

    def login(self):
        pass

    def fillobj(self, obj, fields):
        pass

    def get_content(self, _id):
        "Fetch the page at URL `_id` and return its article content."
        self.location(_id)
        return self.page.get_article(_id)

    def get_daily_date(self, _id):
        "Return the day of the daily page a news brief belongs to, or None."
        self.location(_id)
        return self.page.get_daily_date()

    def get_daily_infos(self, _id):
        """Return (url, title, datetime) for the daily page of day `_id`.

        `_id` is expected to be a YYYY-MM-DD string.
        """
        url = "http://www.voxeurop.eu/fr/today/" + _id
        self.location(url)
        title = self.page.get_title()
        year, month, day = (int(part) for part in _id.split('-'))
        # Daily pages have no publication time; use midnight.
        article_datetime = datetime.combine(date(year, month, day), time.min)
        return url, title, article_datetime
woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/presseurop/favicon.png 0000664 0000000 0000000 00000000553 12356015330 0031250 0 ustar 00root root 0000000 0000000 PNG
IHDR @ @ XGl sRGB PLTEnE ]y$~4 pHYs tIME(S IDATHŕa
0e7ExWjB[%ݠ{+>-(%XO 0I5'U1Z! +@N9 aYUf f\' ; H :SzI- |͊ \s)%7LŲBc
l%|%ۀCrUIMH/XL}],Ea^VWo!l IENDB` woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/presseurop/pages/ 0000775 0000000 0000000 00000000000 12356015330 0030211 5 ustar 00root root 0000000 0000000 __init__.py 0000664 0000000 0000000 00000000000 12356015330 0032231 0 ustar 00root root 0000000 0000000 woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/presseurop/pages woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/presseurop/pages/article.py0000664 0000000 0000000 00000006360 12356015330 0032213 0 ustar 00root root 0000000 0000000 "ArticlePage object for presseurope"
# -*- coding: utf-8 -*-
# Copyright(C) 2012 Florent Fourcot
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.capabilities.messages.genericArticle import GenericNewsPage,\
try_drop_tree, clean_relativ_urls
class PresseuropPage(GenericNewsPage):
    "PresseuropPage object for presseurop"

    def on_loaded(self):
        # Selectors used by GenericNewsPage to locate the article parts.
        self.main_div = self.document.getroot()
        self.element_title_selector = "title"
        self.element_author_selector = "div[id=content-author]>a"
        self.element_body_selector = "div.block, div.panel"

    def get_body(self):
        """Return the article body, stripped of social-sharing widgets."""
        element_body = self.get_element_body()
        try_drop_tree(self.parser, element_body, "li.button-social")
        try_drop_tree(self.parser, element_body, "div.sharecount")
        # Make relative links absolute so the body works out of context.
        clean_relativ_urls(element_body, "http://presseurop.eu")
        return self.parser.tostring(element_body)

    def get_title(self):
        """Return the page title without the trailing '| site' suffix."""
        title = GenericNewsPage.get_title(self)
        return title.split('|')[0]

    def get_author(self):
        """Return the author, suffixed with the source newspaper when known."""
        author = GenericNewsPage.get_author(self)
        try:
            source = self.document.getroot().xpath(
                "//span[@class='sourceinfo']/a")[0]
            return author + " | " + source.text
        except Exception:
            # No source info on the page (or unexpected markup): fall back to
            # the plain author.  Was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            return author
class DailyTitlesPage(PresseuropPage):
    """Page listing all the news briefs of a given day."""

    def on_loaded(self):
        # Same selectors as the parent, but the body lives in "section.main".
        self.main_div = self.document.getroot()
        self.element_title_selector = "title"
        self.element_author_selector = "div[id=content-author]>a"
        self.element_body_selector = "section.main"

    def get_body(self):
        "Return the daily summary body without social/related widgets."
        body = self.get_element_body()
        for selector in ("li.button-social", "aside.articlerelated",
                         "div.sharecount"):
            try_drop_tree(self.parser, body, selector)
        clean_relativ_urls(body, "http://presseurop.eu")
        return self.parser.tostring(body)
class DailySinglePage(PresseuropPage):
    """Page of a single news brief, linked to its daily summary page."""

    def get_daily_date(self):
        "Return the day (as found in the daily link URL), or None if absent."
        paragraphs = self.document.getroot().xpath("//p[@class='w200']")
        if not paragraphs:
            return None
        href = paragraphs[0].xpath('a')[0].attrib['href']
        # The day is the third path component of the daily-page link.
        return href.split('/')[3]
class CartoonPage(PresseuropPage):
    "CartoonPage object for presseurop"

    def on_loaded(self):
        # Cartoons use their own author block and a single panel as body.
        root = self.document.getroot()
        self.main_div = root
        self.element_title_selector = "title"
        self.element_author_selector = "div.profilecartoontext>p>a"
        self.element_body_selector = "div.panel"
woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/presseurop/test.py 0000664 0000000 0000000 00000001725 12356015330 0030450 0 ustar 00root root 0000000 0000000 # -*- coding: utf-8 -*-
# Copyright(C) 2012 Florent Fourcot
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.test import BackendTest
__all__ = ['PresseuropTest']
class PresseuropTest(BackendTest):
    """Smoke test for the presseurop backend."""
    BACKEND = 'presseurop'

    def test_new_messages(self):
        # Consuming the iterator is enough: it must not raise.
        for _message in self.backend.iter_unread_messages():
            pass
woob-e025fb0b2040e76d68512fca33d3483aa63d925d-modules-presseurop/modules/presseurop/tools.py 0000664 0000000 0000000 00000002025 12356015330 0030623 0 ustar 00root root 0000000 0000000 "tools for presseurop backend"
# -*- coding: utf-8 -*-
# Copyright(C) 2012 Florent Fourcot
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
def url2id(url):
    """Return the article id extracted from a presseurop URL.

    Daily ("/today/") URLs carry the id in their fragment part; other
    article URLs embed a numeric id in their last path component.
    Raises AttributeError if a non-daily URL has no numeric id.
    """
    if "/today/" in url:
        return url.split("#")[1]
    # `re` caches compiled patterns, so matching directly avoids the
    # per-call re.compile of the original; also stops shadowing builtin `id`.
    return re.match(r".*/([0-9]+)-.*", url).group(1)
def rssid(entry):
    "Return the article id of a feed entry, derived from its link URL."
    return url2id(entry.link)