| 1 | # LinkExchange - Universal link exchange service client |
|---|
| 2 | # Copyright (C) 2009 Konstantin Korikov |
|---|
| 3 | # |
|---|
| 4 | # This library is free software; you can redistribute it and/or |
|---|
| 5 | # modify it under the terms of the GNU Lesser General Public |
|---|
| 6 | # License as published by the Free Software Foundation; either |
|---|
| 7 | # version 2.1 of the License, or (at your option) any later version. |
|---|
| 8 | # |
|---|
| 9 | # This library is distributed in the hope that it will be useful, |
|---|
| 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|---|
| 12 | # Lesser General Public License for more details. |
|---|
| 13 | # |
|---|
| 14 | # You should have received a copy of the GNU Lesser General Public |
|---|
| 15 | # License along with this library; if not, write to the Free Software |
|---|
| 16 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|---|
| 17 | # |
|---|
| 18 | # NOTE: In the context of the Python environment, I interpret "dynamic |
|---|
| 19 | # linking" as importing -- thus the LGPL applies to the contents of |
|---|
| 20 | # the modules, but make no requirements on code importing these |
|---|
| 21 | # modules. |
|---|
| 22 | |
|---|
| 23 | import xml.sax |
|---|
| 24 | import xml.dom |
|---|
| 25 | import xml.dom.pulldom |
|---|
| 26 | import logging |
|---|
| 27 | |
|---|
# Compatibility shim: the ``set`` builtin was added in Python 2.4. On older
# interpreters the bare name lookup raises NameError and the equivalent class
# from the ``sets`` module is bound to the same name instead.
try:
    set
except NameError:
    from sets import Set as set
|---|
| 32 | |
|---|
| 33 | from linkexchange.clients.sape import SapeLikeClient |
|---|
| 34 | from linkexchange.clients.sape import SapeLikeTestServer |
|---|
| 35 | from linkexchange.utils import is_plugin_specifier, load_plugin |
|---|
| 36 | from linkexchange.utils import normalize_uri |
|---|
| 37 | |
|---|
# Module-level logger used by the classes below for debug, deprecation
# warning and XML parse error messages.
log = logging.getLogger('linkexchange.clients.linkfeed')
|---|
| 39 | |
|---|
class LinkFeedClient(SapeLikeClient):
    """
    LinkFeed.ru client.

    By default links data is requested in the 'xml' format from the URL
    templates listed in server_list. Both settings may be overridden per
    instance through constructor keywords.
    """

    server_list = [
        'http://db.linkfeed.ru/%(user)s/%(host)s/UTF-8.xml']
    server_format = 'xml'

    def __init__(self, user, db_driver, **kw):
        """
        LinkFeedClient constructor.

        The user is the hash code string assigned to the user by the link
        exchange service.

        The db_driver argument is a multihash database driver instance or a
        plugin specifier. In the second case the plugin specifier is used to
        create a new instance. The database driver instance is used to store
        the links database.

        All keyword arguments are also forwarded to the base class
        constructor.

        @param user: user hash code string on link exchange service
        @param db_driver: multihash database driver instance or plugin specifier
        @keyword server_list: list of servers URLs
        @keyword server_format: server output data format 'xml' or 'php'
        @keyword use_xml: deprecated, true selects 'xml' and false 'php';
          takes precedence over server_format
        @keyword xml_server_list: deprecated alias of server_list; takes
          precedence over it
        """
        super(LinkFeedClient, self).__init__(user, **kw)
        if is_plugin_specifier(db_driver):
            db_driver = load_plugin('linkexchange.multihash_drivers', db_driver)
        self.db_driver = db_driver
        for param in ('server_list', 'server_format'):
            if param in kw:
                setattr(self, param, kw[param])
        # Deprecated keywords are handled last so that they win over the
        # current ones when both are given.
        if 'use_xml' in kw:
            log.warning("The use_xml parameter is depricated!")
            if kw['use_xml']:
                self.server_format = 'xml'
            else:
                self.server_format = 'php'
        if 'xml_server_list' in kw:
            log.warning("The xml_server_list parameter is depricated!")
            self.server_list = kw['xml_server_list']
        # NOTE(review): attributes other than db_driver, server_list and
        # server_format are not assigned in this class; presumably the base
        # constructor provides them -- confirm.
        log.debug("New %s instance:\n%s",
                self.__class__.__name__,
                '\n'.join(["    %s: %s" % (x, repr(getattr(self, x)))
                    for x in (
                        'user',
                        'db_driver',
                        'db_lifetime',
                        'db_reloadtime',
                        'socket_timeout',
                        'force_show_code',
                        'no_query_string',
                        'server_list',
                        'server_format',
                        'server_charset',
                        'user_agent')]))

    def load_links_data(self, request):
        """
        Return links data for the request by delegating to load_data() with
        this client's database driver, server list and server format.
        """
        return self.load_data(self.db_driver, self.server_list,
                self.server_format, request)

    def refresh_db(self, request):
        """
        Refresh the links database by delegating to refresh_data() with this
        client's database driver, server list and server format.
        """
        self.refresh_data(self.db_driver, self.server_list,
                self.server_format, request)

    def parse_param(self, name, value):
        """
        Normalize a raw parameter value before the base class parses it.

        A '__linkfeed_robots__' value that arrives as a dictionary is
        replaced by the list of its values; everything else is passed to the
        base implementation unchanged.
        """
        if name == '__linkfeed_robots__' and isinstance(value, dict):
            # list() keeps the result a real list on Python 3, where
            # dict.values() returns a view.
            value = list(value.values())
        return super(LinkFeedClient, self).parse_param(name, value)

    def get_links_new_page(self, data, request):
        """
        Return the links for a page that is absent from the database.

        The result is [''] when the check code is visible and at least one of
        the start/end markers is non-empty, otherwise an empty list.
        """
        if self.is_check_code_visible(data, request):
            if (data.get('__linkfeed_start__', '') +
                    data.get('__linkfeed_end__', '')):
                return ['']
        return []

    def get_delimiter(self, data, request):
        """
        Return the links delimiter stored in data ('' when absent).
        """
        return data.get('__linkfeed_delimiter__', '')

    def is_bot(self, data, request):
        """
        Return True when the request's remote address is listed in the
        '__linkfeed_robots__' entry of data, False otherwise (including when
        the request has no remote address).
        """
        bot_ips = data.get('__linkfeed_robots__', [])
        # bool() so callers always get True/False rather than None or ''.
        return bool(request.remote_addr and request.remote_addr in bot_ips)

    def transform_code(self, data, request, code):
        """
        Wrap the links code with the configured markers.

        The start/end markers are added only when the check code is visible;
        the before/after texts are added only when code is non-empty.

        @return: start + before_text + code + after_text + end
        """
        if self.is_check_code_visible(data, request):
            start = data.get('__linkfeed_start__', '')
            end = data.get('__linkfeed_end__', '')
        else:
            start = end = u''
        if code:
            before_text = data.get('__linkfeed_before_text__', '')
            after_text = data.get('__linkfeed_after_text__', '')
        else:
            before_text = after_text = u''
        return start + before_text + code + after_text + end

    def parse_data(self, source, url, format):
        """
        Parse links data received from a server.

        For the 'xml' format a generator of (key, value) pairs is returned:

          - (normalized page URI, list of parsed links) for every
            data/pages/page element;
          - ('__linkfeed_<name>__', text) for every data/config/item element
            that has a non-empty name attribute;
          - ('__linkfeed_robots__', list of IP strings) for data/bot_ips.

        Every key is yielded at most once; later duplicates are skipped. The
        XML is parsed lazily, so parse errors surface while the generator is
        being consumed. Any other format is handled by the base class.

        @param source: XML source handed to xml.dom.pulldom.parse()
        @param url: URL the data was fetched from (used in the error log)
        @param format: data format, 'xml' is handled here
        @raise ClientDataError: if the XML data is not well-formed
        """
        def node_text(node):
            # Concatenate only the direct text children of the node.
            return u''.join([sn.nodeValue for sn in node.childNodes
                if sn.nodeType == xml.dom.Node.TEXT_NODE])

        def parse_xml(events):
            path = []           # tag names from the root to the current element
            keys_send = set()   # keys already yielded
            try:
                for (event, node) in events:
                    if event == xml.dom.pulldom.START_ELEMENT:
                        path.append(node.tagName)
                        if path == ['data', 'pages', 'page']:
                            # expandNode() consumes the events up to and
                            # including the matching end tag, so the path has
                            # to be popped by hand.
                            events.expandNode(node)
                            path.pop()
                            uri = normalize_uri(node.getAttribute('url'))
                            link_nodes = node.getElementsByTagName('link')
                            # The guard has to test the page key (uri), not
                            # the server URL argument of parse_data().
                            if uri not in keys_send:
                                yield (uri, [self.parse_link(node_text(x))
                                    for x in link_nodes])
                                keys_send.add(uri)
                        elif path == ['data', 'config', 'item']:
                            events.expandNode(node)
                            path.pop()
                            name = str(node.getAttribute('name'))
                            if name:
                                key = "__linkfeed_%s__" % name
                                if key not in keys_send:
                                    yield (key, node_text(node))
                                    keys_send.add(key)
                        elif path == ['data', 'bot_ips']:
                            events.expandNode(node)
                            path.pop()
                            ip_nodes = node.getElementsByTagName('ip')
                            if '__linkfeed_robots__' not in keys_send:
                                yield ('__linkfeed_robots__',
                                        [node_text(x) for x in ip_nodes])
                                keys_send.add('__linkfeed_robots__')
                    elif event == xml.dom.pulldom.END_ELEMENT:
                        path.pop()
            except xml.sax.SAXParseException:
                # sys.exc_info() instead of "except ..., e" (Python 2 only)
                # or "except ... as e" (Python 2.6+) keeps this handler valid
                # on every Python release.
                import sys
                e = sys.exc_info()[1]
                log.error("Could not parse XML data: %s: %s", str(e), url)
                # NOTE(review): ClientDataError was raised here without being
                # imported anywhere in this module, so a parse failure ended
                # in NameError. It is assumed to be defined in
                # linkexchange.clients.base -- confirm the location.
                from linkexchange.clients.base import ClientDataError
                raise ClientDataError('Could not parse XML data: %s' % str(e))

        if format == 'xml':
            return parse_xml(xml.dom.pulldom.parse(source))
        return super(LinkFeedClient, self).parse_data(source, url, format)
|---|
| 182 | |
|---|
class LinkFeedTestServer(SapeLikeTestServer):
    """
    Test server to test LinkFeed client.

    The data attribute is the fixture served to the client: keys starting
    with '/' are page URIs mapped to lists of link HTML, the
    '__linkfeed_*__' keys are configuration values.
    """
    data = {
        '/': [
            '<a href="http://example1.com">example text 1</a>',
            '<a href="http://example2.com">example text 2</a>',
            ],
        '/path/1': [
            '<a href="http://example1.com">example text 1</a>',
            '<a href="http://example2.com">example text 2</a>',
            '<a href="http://example3.com">example text 3</a>',
            '<a href="http://example4.com">example text 4</a>',
            ],
        '/path/2': [
            'Plain text and <a href="url">link text</a>'],
        '__linkfeed_start__': '<!--12345-->',
        '__linkfeed_end__': '<!--12345-->',
        '__linkfeed_delimiter__': '. ',
        '__linkfeed_before_text__': '',
        '__linkfeed_after_text__': '',
        '__linkfeed_robots__': ['123.45.67.89'],
    }

    def format_data(self, data):
        """
        Serialize data in the format selected by self.server_format.

        For 'xml' the result is a document with the layout

            data/config/item[@name]   start, end, delimiter, before_text,
                                      after_text
            data/bot_ips/ip           entries of '__linkfeed_robots__'
                                      (element omitted when there are none)
            data/pages/page[@url]     one per key starting with '/', with a
                                      link child per link

        Any other format is delegated to the base class.

        @param data: dictionary of page URIs and '__linkfeed_*__' settings
        @return: formatted data as a string
        """
        if self.server_format != 'xml':
            return super(LinkFeedTestServer, self).format_data(data)

        # Imported locally: "import xml.sax" at module level does not
        # guarantee that the saxutils submodule is loaded.
        from xml.sax.saxutils import escape, quoteattr

        def cdata(value):
            # A literal "]]>" would terminate the CDATA section early, so it
            # is split across two adjacent sections.
            text = '%s' % (value,)
            return '<![CDATA[%s]]>' % text.replace(']]>', ']]]]><![CDATA[>')

        def xml_make_page(uri, links):
            # quoteattr() escapes the value and adds the surrounding quotes,
            # so URIs containing '&', '<' or '"' stay well-formed.
            return '<page url=%s>%s</page>' % (quoteattr(uri),
                ''.join(['<link>%s</link>' % cdata(s) for s in links]))

        pages = '\n'.join([xml_make_page(uri, links)
            for uri, links in data.items() if uri.startswith('/')])

        # The client reads robot addresses from data/bot_ips/ip; without this
        # section '__linkfeed_robots__' never reached clients using the XML
        # format.
        robots = data.get('__linkfeed_robots__', [])
        if isinstance(robots, dict):
            # LinkFeedClient.parse_param() accepts a dictionary here too and
            # uses its values.
            robots = robots.values()
        bot_ips = ''.join(['<ip>%s</ip>' % escape('%s' % (ip,))
            for ip in robots])

        lines = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<data>',
            '<config>',
            ]
        for name in ('start', 'end', 'delimiter',
                'before_text', 'after_text'):
            lines.append('<item name="%s">%s</item>' % (
                name, cdata(data.get('__linkfeed_%s__' % name, ''))))
        lines.append('</config>')
        if bot_ips:
            lines.append('<bot_ips>%s</bot_ips>' % bot_ips)
        lines.extend([
            '<pages>',
            pages,
            '</pages>',
            '</data>',
            ])
        return '\n'.join(lines)
|---|
| 240 | |
|---|
# When the module is executed as a script, run any doctests found in its
# docstrings.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
|---|