source: LinkExchange/trunk/linkexchange/clients/linkfeed.py @ 257

Revision 257, 9.6 KB checked in by lostclus, 3 months ago (diff)

Fixed duplicate keys output when parsing XML.

Line 
1# LinkExchange - Universal link exchange service client
2# Copyright (C) 2009 Konstantin Korikov
3#
4# This library is free software; you can redistribute it and/or
5# modify it under the terms of the GNU Lesser General Public
6# License as published by the Free Software Foundation; either
7# version 2.1 of the License, or (at your option) any later version.
8#
9# This library is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12# Lesser General Public License for more details.
13#
14# You should have received a copy of the GNU Lesser General Public
15# License along with this library; if not, write to the Free Software
16# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17#
18# NOTE: In the context of the Python environment, I interpret "dynamic
19# linking" as importing -- thus the LGPL applies to the contents of
20# the modules, but make no requirements on code importing these
21# modules.
22
23import xml.sax
24import xml.dom
25import xml.dom.pulldom
26import logging
27
28try:
29    set
30except NameError:
31    from sets import Set as set
32
33from linkexchange.clients.sape import SapeLikeClient
34from linkexchange.clients.sape import SapeLikeTestServer
35from linkexchange.utils import is_plugin_specifier, load_plugin
36from linkexchange.utils import normalize_uri
37
# Module-level logger shared by everything in this file. The logger name is
# spelled out explicitly instead of being derived from __name__.
log = logging.getLogger('linkexchange.clients.linkfeed')
39
class LinkFeedClient(SapeLikeClient):
    """
    LinkFeed.ru client.

    Extends SapeLikeClient with the LinkFeed server URL template, a parser
    for the LinkFeed XML feed and handling of the C{__linkfeed_*__} service
    keys that are stored next to the per-page link lists.
    """

    # URL templates of the feed servers. %(user)s and %(host)s are
    # placeholders (presumably substituted by SapeLikeClient -- confirm there).
    server_list = [
            'http://db.linkfeed.ru/%(user)s/%(host)s/UTF-8.xml']
    # Default feed format; the constructor documents 'xml' and 'php'.
    server_format = 'xml'

    def __init__(self, user, db_driver, **kw):
        """
        LinkFeedClient constructor.

        The user is the hash code string that is assigned to the user on the
        link exchange service.

        The db_driver argument is a multihash database driver instance or a
        plugin specifier. In the second case the plugin specifier is used to
        create a new instance. The database driver instance is used to store
        the links database.

        All keyword arguments are also forwarded to the SapeLikeClient
        constructor.

        @param user: user hash code string on link exchange service
        @param db_driver: multihash database driver instance or plugin specifier
        @keyword server_list: list of servers URLs
        @keyword server_format: server output data format 'xml' or 'php'
        @keyword use_xml: deprecated, a true value selects 'xml', a false
            value selects 'php'; overrides server_format
        @keyword xml_server_list: deprecated alias of server_list; overrides
            server_list
        """
        super(LinkFeedClient, self).__init__(user, **kw)
        if is_plugin_specifier(db_driver):
            db_driver = load_plugin('linkexchange.multihash_drivers', db_driver)
        self.db_driver = db_driver
        for param in ('server_list', 'server_format'):
            if param in kw:
                setattr(self, param, kw[param])
        # The deprecated spellings are applied last, so they win over the
        # current ones when both are given.
        if 'use_xml' in kw:
            log.warning("The use_xml parameter is depricated!")
            if kw['use_xml']:
                self.server_format = 'xml'
            else:
                self.server_format = 'php'
        if 'xml_server_list' in kw:
            log.warning("The xml_server_list parameter is depricated!")
            self.server_list = kw['xml_server_list']
        # Attributes not assigned in this class are expected to come from
        # the base class constructor -- TODO confirm against SapeLikeClient.
        log.debug("New %s instance:\n%s",
                self.__class__.__name__,
                '\n'.join(["    %s: %s" % (x, repr(getattr(self, x)))
                    for x in (
                        'user',
                        'db_driver',
                        'db_lifetime',
                        'db_reloadtime',
                        'socket_timeout',
                        'force_show_code',
                        'no_query_string',
                        'server_list',
                        'server_format',
                        'server_charset',
                        'user_agent')]))

    def load_links_data(self, request):
        """
        Load the links data by delegating to self.load_data() with this
        client's database driver, server list and server format.

        @param request: request object, passed through unchanged
        @return: whatever self.load_data() returns
        """
        return self.load_data(self.db_driver, self.server_list,
                self.server_format, request)

    def refresh_db(self, request):
        """
        Refresh the links database by delegating to self.refresh_data() with
        this client's database driver, server list and server format.

        @param request: request object, passed through unchanged
        """
        self.refresh_data(self.db_driver, self.server_list,
                self.server_format, request)

    def parse_param(self, name, value):
        """
        Normalize one (name, value) pair before handing it to the base class.

        The robots list may arrive as a dictionary; only its values are
        kept in that case.

        @param name: parameter (key) name
        @param value: raw parameter value
        @return: result of SapeLikeClient.parse_param()
        """
        if name == '__linkfeed_robots__' and isinstance(value, dict):
            # list() keeps the value a real list on interpreters where
            # dict.values() returns a view.
            value = list(value.values())
        return super(LinkFeedClient, self).parse_param(name, value)

    def get_links_new_page(self, data, request):
        """
        Return the links list to use for a page that has no entry in data.

        @return: C{['']} when the check code is visible for this request and
            at least one of the start/end markers is non-empty, otherwise
            an empty list
        """
        if self.is_check_code_visible(data, request):
            if (data.get('__linkfeed_start__', '') +
                    data.get('__linkfeed_end__', '')):
                return ['']
        return []

    def get_delimiter(self, data, request):
        """
        Return the links delimiter stored in data, or an empty string.
        """
        return data.get('__linkfeed_delimiter__', '')

    def is_bot(self, data, request):
        """
        Tell whether the request comes from an address listed under the
        '__linkfeed_robots__' key of data.

        @return: a true value if request.remote_addr is listed; a false value
            otherwise (an empty remote_addr is returned as is)
        """
        bot_ips = data.get('__linkfeed_robots__', [])
        return request.remote_addr and request.remote_addr in bot_ips

    def transform_code(self, data, request, code):
        """
        Wrap the generated links code with the configured decorations.

        The start/end markers are added only when the check code is visible
        for this request; the before/after texts are added only when code is
        non-empty.

        @param code: links HTML code, possibly empty
        @return: start + before_text + code + after_text + end
        """
        if self.is_check_code_visible(data, request):
            start = data.get('__linkfeed_start__', '')
            end = data.get('__linkfeed_end__', '')
        else:
            start = end = u''
        if code:
            before_text = data.get('__linkfeed_before_text__', '')
            after_text = data.get('__linkfeed_after_text__', '')
        else:
            before_text = after_text = u''
        return start + before_text + code + after_text + end

    def parse_data(self, source, url, format):
        """
        Parse a raw server response into (key, value) pairs.

        For the 'xml' format a generator is returned that yields, each key at
        most once:

          - (normalized page URI, list of parsed links) for every
            data/pages/page element;
          - ('__linkfeed_<name>__', text) for every named data/config/item
            element;
          - ('__linkfeed_robots__', list of IP strings) for the data/bot_ips
            element.

        Any other format is delegated to SapeLikeClient.parse_data().

        @param source: XML input accepted by xml.dom.pulldom.parse()
        @param url: URL the data came from; used only in the error log
        @param format: server data format
        @raise ClientDataError: the XML is malformed; raised lazily, while
            the returned generator is being consumed
        """
        def node_text(node):
            # Concatenate the direct text children of node; nested elements
            # are ignored.
            return u''.join([sn.nodeValue for sn in node.childNodes
                if sn.nodeType == xml.dom.Node.TEXT_NODE])

        def parse_xml(events):
            # Tag names from the document root down to the current element.
            path = []
            # Keys already yielded; used to suppress duplicates.
            keys_send = set()
            try:
                for (event, node) in events:
                    if event == xml.dom.pulldom.START_ELEMENT:
                        path.append(node.tagName)
                        if path == ['data', 'pages', 'page']:
                            # expandNode() reads the stream up to the
                            # matching end tag, so the END_ELEMENT branch
                            # below never sees it: pop the path here.
                            events.expandNode(node)
                            path.pop()
                            uri = normalize_uri(node.getAttribute('url'))
                            link_nodes = node.getElementsByTagName('link')
                            # Deduplicate on the page URI. Testing the feed
                            # ``url`` here (as the code used to) never
                            # matches anything stored in keys_send.
                            if uri not in keys_send:
                                yield (uri, [self.parse_link(node_text(x))
                                    for x in link_nodes])
                                keys_send.add(uri)
                        elif path == ['data', 'config', 'item']:
                            events.expandNode(node)
                            path.pop()
                            name = str(node.getAttribute('name'))
                            if name:
                                key = "__linkfeed_%s__" % name
                                if key not in keys_send:
                                    yield (key, node_text(node))
                                    keys_send.add(key)
                        elif path == ['data', 'bot_ips']:
                            events.expandNode(node)
                            path.pop()
                            ip_nodes = node.getElementsByTagName('ip')
                            if '__linkfeed_robots__' not in keys_send:
                                yield ('__linkfeed_robots__',
                                        [node_text(x) for x in ip_nodes])
                                keys_send.add('__linkfeed_robots__')
                    elif event == xml.dom.pulldom.END_ELEMENT:
                        path.pop()
            except xml.sax.SAXParseException:
                # sys.exc_info() is used instead of binding the exception in
                # the except clause so that this handler parses under both
                # the "except X, e" and the "except X as e" syntaxes.
                import sys
                e = sys.exc_info()[1]
                log.error("Could not parse XML data: %s: %s", str(e), url)
                # NOTE(review): ClientDataError is not imported in the
                # visible part of this module; unless it is provided some
                # other way this line fails with NameError. Import it from
                # the module that defines it -- TODO confirm which one.
                raise ClientDataError('Could not parse XML data: %s' % str(e))

        if format == 'xml':
            return parse_xml(xml.dom.pulldom.parse(source))
        return super(LinkFeedClient, self).parse_data(source, url, format)
182
class LinkFeedTestServer(SapeLikeTestServer):
    """
    Test server to test LinkFeed client.

    Serves the fixture below either as a LinkFeed XML document or, for any
    other server format, in whatever form SapeLikeTestServer produces.
    """
    # Fixture: keys starting with '/' are page URIs mapped to their links;
    # the '__linkfeed_*__' keys are service settings.
    data = {
        '/': [
            '<a href="http://example1.com">example text 1</a>',
            '<a href="http://example2.com">example text 2</a>',
            ],
        '/path/1': [
            '<a href="http://example1.com">example text 1</a>',
            '<a href="http://example2.com">example text 2</a>',
            '<a href="http://example3.com">example text 3</a>',
            '<a href="http://example4.com">example text 4</a>',
            ],
        '/path/2': [
            'Plain text and <a href="url">link text</a>'],
        '__linkfeed_start__' : '<!--12345-->',
        '__linkfeed_end__' : '<!--12345-->',
        '__linkfeed_delimiter__' : '. ',
        '__linkfeed_before_text__' : '',
        '__linkfeed_after_text__' : '',
        '__linkfeed_robots__' : ['123.45.67.89'],
        }

    def format_data(self, data):
        """
        Serialize data in the format selected by self.server_format.

        For 'xml' a LinkFeed-style document is built by hand: a <config>
        section with the start, end, delimiter, before_text and after_text
        items followed by a <pages> section with one <page> per key that
        starts with '/'. Every other format is handled by the base class.

        NOTE(review): '__linkfeed_robots__' is not written to the XML
        output -- confirm whether that is intended.

        @param data: dictionary shaped like the class-level fixture
        @return: serialized data
        """
        if self.server_format != 'xml':
            return super(LinkFeedTestServer, self).format_data(data)

        def render_page(page_uri, page_links):
            # One <page> element holding a CDATA-wrapped <link> per entry.
            link_tags = []
            for link in page_links:
                link_tags.append('<link><![CDATA[%s]]></link>' % link)
            return '<page url="%s">%s</page>' % (page_uri,
                    ''.join(link_tags))

        # Only keys that look like paths are pages; the rest are settings.
        page_tags = []
        for key, value in data.items():
            if key.startswith('/'):
                page_tags.append(render_page(key, value))

        start = data.get('__linkfeed_start__', '')
        end = data.get('__linkfeed_end__', '')
        delimiter = data.get('__linkfeed_delimiter__', '')
        before_text = data.get('__linkfeed_before_text__', '')
        after_text = data.get('__linkfeed_after_text__', '')

        document = []
        document.append('<?xml version="1.0" encoding="UTF-8"?>')
        document.append('<data>')
        document.append('<config>')
        document.append('<item name="start"><![CDATA[%s]]></item>' % start)
        document.append('<item name="end"><![CDATA[%s]]></item>' % end)
        document.append(
                '<item name="delimiter"><![CDATA[%s]]></item>' % delimiter)
        document.append(
                '<item name="before_text"><![CDATA[%s]]></item>' % before_text)
        document.append(
                '<item name="after_text"><![CDATA[%s]]></item>' % after_text)
        document.append('</config>')
        document.append('<pages>')
        document.append('\n'.join(page_tags))
        document.append('</pages>')
        document.append('</data>')
        return '\n'.join(document)
240
# When executed as a script, run the doctests found in this module.
if __name__ == "__main__":
    from doctest import testmod
    testmod()
Note: See TracBrowser for help on using the repository browser.