| 1 | # LinkExchange - Universal link exchange service client |
|---|
| 2 | # Copyright (C) 2009 Konstantin Korikov |
|---|
| 3 | # |
|---|
| 4 | # This library is free software; you can redistribute it and/or |
|---|
| 5 | # modify it under the terms of the GNU Lesser General Public |
|---|
| 6 | # License as published by the Free Software Foundation; either |
|---|
| 7 | # version 2.1 of the License, or (at your option) any later version. |
|---|
| 8 | # |
|---|
| 9 | # This library is distributed in the hope that it will be useful, |
|---|
| 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|---|
| 12 | # Lesser General Public License for more details. |
|---|
| 13 | # |
|---|
| 14 | # You should have received a copy of the GNU Lesser General Public |
|---|
| 15 | # License along with this library; if not, write to the Free Software |
|---|
| 16 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|---|
| 17 | # |
|---|
| 18 | # NOTE: In the context of the Python environment, I interpret "dynamic |
|---|
| 19 | # linking" as importing -- thus the LGPL applies to the contents of |
|---|
| 20 | # the modules, but make no requirements on code importing these |
|---|
| 21 | # modules. |
|---|
| 22 | |
|---|
| 23 | import sys |
|---|
| 24 | import pkg_resources |
|---|
| 25 | import urllib |
|---|
| 26 | import urllib2 |
|---|
| 27 | import urlparse |
|---|
| 28 | import httplib |
|---|
| 29 | import socket |
|---|
| 30 | import logging |
|---|
| 31 | import HTMLParser |
|---|
| 32 | |
|---|
# Compatibility shim: when the built-in ``set`` name does not exist
# (interpreters older than Python 2.4), bind ``set`` to ``sets.Set``.
try:
    set
except NameError:
    from sets import Set as set

# Version string of the installed 'LinkExchange' distribution, as reported
# by pkg_resources.
version = pkg_resources.get_distribution('LinkExchange').version
# Client identification string that embeds the version and the project URL;
# presumably sent as the HTTP User-Agent by callers -- verify against usage.
default_user_agent = 'LinkExchange/%s (+http://linkexchange.org.ua)' % version
|---|
| 40 | |
|---|
def is_plugin_specifier(x):
    """
    Tells whether x is a plugin specifier, i.e. a list or a tuple (or an
    instance of a subclass of either) as accepted by load_plugin().

    >>> is_plugin_specifier(('name', [], {}))
    True
    >>> is_plugin_specifier('name')
    False
    """
    # isinstance() rather than an exact type() comparison, so that list and
    # tuple subclasses are recognised as well.
    return isinstance(x, (list, tuple))
|---|
| 43 | |
|---|
def load_plugin(space, specifier):
    """
    Creates a plugin instance from an entry point.

    space is the entry point group name; specifier is a sequence of the
    form (name[, args[, kwargs]]) where name is the entry point name, args
    is a sequence of positional arguments and kwargs is a dictionary of
    keyword arguments for the plugin class.

    Every entry point registered under that group and name is tried in
    turn. The first one that loads is called as cls(*args, **kwargs) and
    the result is returned.

    Raises ImportError when no matching entry point exists, or re-raises
    the last ImportError when every matching entry point failed to load.
    Raises IndexError when specifier is empty.
    """
    specifier = list(specifier)
    name = specifier[0]
    args = []
    kwargs = {}
    if len(specifier) > 1:
        args = specifier[1]
    if len(specifier) > 2:
        kwargs = specifier[2]

    error = None
    for ep in pkg_resources.iter_entry_points(space, name):
        try:
            cls = ep.load()
        except ImportError:
            # Fetch the exception through sys.exc_info() so that nothing
            # depends on the except-clause variable staying bound after
            # the handler has finished.
            error = sys.exc_info()[1]
        else:
            return cls(*args, **kwargs)
    if error is not None:
        # Raise the stored failure, not a name local to the handler.
        raise error
    raise ImportError("No entry point found: %s.%s" % (space, name))
|---|
| 67 | |
|---|
def urlopen_with_timeout(url, timeout):
    """
    Opens URL with timeout. Returns file-like object.

    On Python 2.6 and newer the timeout is handed straight to
    urllib2.urlopen(). On older interpreters a custom urllib2 handler is
    used whose HTTP connection sets the timeout on its socket right after
    the connection has been made.

    >>> import threading
    >>> import BaseHTTPServer, SimpleHTTPServer
    >>> addr, port = ('127.0.0.1', 49152)
    >>> url = 'http://%s:%d' % (addr, port)
    >>> ready = threading.Event()
    >>> shutdown = threading.Event()
    >>> def serve():
    ...     srv = BaseHTTPServer.HTTPServer((addr, port),
    ...             SimpleHTTPServer.SimpleHTTPRequestHandler)
    ...     ready.set()
    ...     srv.handle_request()
    ...     shutdown.wait()

    >>> srv_t = threading.Thread(target=serve)
    >>> srv_t.start()
    >>> ignore = ready.wait()
    >>> f = urlopen_with_timeout(url, 0.1)
    >>> 'content-type' in f.info()
    True
    >>> try: f = urlopen_with_timeout(url, 0.1)
    ... except urlopen_errors: f = None
    >>> f is None
    True
    >>> shutdown.set()
    >>> srv_t.join()
    """
    # Modern interpreters support the timeout argument natively.
    if sys.version_info >= (2, 6):
        return urllib2.urlopen(url, None, timeout)

    class _TimeoutHTTPConnection(httplib.HTTPConnection):
        def connect(self):
            httplib.HTTPConnection.connect(self)
            # The timeout is applied once the socket exists, i.e. after
            # the connection itself has been established.
            self.sock.settimeout(timeout)

    # Before Python 2.4 do_open() is given an httplib.HTTP style class, so
    # wrap the connection class accordingly.
    if sys.version_info < (2, 4):
        class _TimeoutHTTP(httplib.HTTP):
            _connection_class = _TimeoutHTTPConnection
        http_class = _TimeoutHTTP
    else:
        http_class = _TimeoutHTTPConnection

    class _TimeoutHTTPHandler(urllib2.HTTPHandler):
        def http_open(self, req):
            return self.do_open(http_class, req)

    opener = urllib2.build_opener(_TimeoutHTTPHandler)
    return opener.open(url)
|---|
| 117 | |
|---|
# Exception classes grouped into one tuple so that a single ``except``
# clause can catch any of them (see the urlopen_with_timeout() doctest).
urlopen_errors = (urllib2.URLError, httplib.HTTPException, OSError,
        socket.error, socket.herror, socket.gaierror, socket.timeout)
|---|
| 120 | |
|---|
| 121 | def normalize_uri(uri): |
|---|
| 122 | if isinstance(uri, unicode): |
|---|
| 123 | uri = uri.encode('utf-8') |
|---|
| 124 | (s, n, p, q, f) = urlparse.urlsplit(uri) |
|---|
| 125 | p = urllib.quote(urllib.unquote(p), '/') |
|---|
| 126 | p = p[:1] + p[1:].rstrip('/') |
|---|
| 127 | return urlparse.urlunsplit((s, n, p, q, f)) |
|---|
| 128 | |
|---|
def rearrange_blocks(request, blocks, rearrange_map = None):
    """
    Rearranges links blocks according to rearrange_map and depending of request
    URI.

    rearrange_map is a sequence of (i1, i2, o1, o2) tuples: the input
    blocks[i1:i2] are placed into the output positions o1..o2-1. The
    first position used is chosen from the sum of the character codes of
    request.uri, and placement wraps around inside the output range,
    skipping positions that are already occupied. Without a map, all
    blocks are rotated within a range as long as the blocks list.

    Returns a list whose length is the greatest o2 of the map; positions
    that received no block hold an empty unicode string. Input blocks that
    do not fit into the free positions of their output range are dropped,
    and entries with an empty output range (o2 <= o1) place nothing.

    >>> from linkexchange.clients import PageRequest
    >>> request = PageRequest(host='example.com', uri='/')
    >>> ord('/')
    47
    >>> blocks = [u'b1', u'b2', u'b3']
    >>> rearrange_map = [(0, 2, 0, 3), (2, 3, 3, 5)]
    >>> rearrange_blocks(request, blocks, rearrange_map)
    [u'b2', u'', u'b1', u'', u'b3']
    """
    if rearrange_map is None:
        rearrange_map = [(0, len(blocks), 0, len(blocks))]
    req_sum = sum([ord(x) for x in request.uri])
    placed = {}
    result_len = 0
    for i1, i2, o1, o2 in rearrange_map:
        if o2 > result_len:
            result_len = o2
        width = o2 - o1
        if width <= 0:
            # Empty output range: nothing can be placed, and taking the
            # modulo below would divide by zero.
            continue
        pending = blocks[i1:i2]
        start = req_sum % width
        taken = 0
        # One pass over the output range visits every position exactly
        # once, so the loop always terminates even when there are more
        # input blocks than free positions.
        for step in range(width):
            if taken >= len(pending):
                break
            slot = o1 + (start + step) % width
            if slot not in placed:
                placed[slot] = pending[taken]
                taken += 1
    return [placed.get(i, u"") for i in range(0, result_len)]
|---|
| 160 | |
|---|
def parse_rearrange_map(map_str):
    """
    Parse rearrange map string as it specified in the configuration file.

    The string is a comma separated list of "i1:i2-o1:o2" entries; each
    entry becomes an (i1, i2, o1, o2) tuple of integers. Whitespace around
    an entry is ignored. Raises ValueError for a malformed string.

    >>> parse_rearrange_map('0:1-0:3,1:2-3:5,2:3-0:3')
    [(0, 1, 0, 3), (1, 2, 3, 5), (2, 3, 0, 3)]
    """
    def split_range(text):
        # Exactly one ':' is expected; anything else fails the unpacking
        # with ValueError.
        begin, end = text.split(':')
        return begin, end

    parsed = []
    try:
        for chunk in map_str.split(','):
            source, target = chunk.strip().split('-')
            i1, i2 = split_range(source)
            o1, o2 = split_range(target)
            parsed.append((int(i1), int(i2), int(o1), int(o2)))
    except ValueError:
        raise ValueError("Invalid rearrange map string")
    return parsed
|---|
| 178 | |
|---|
def configure_logger(handler = None, formatter = None, level = None):
    """
    Installs a handler on the 'linkexchange' logger.

    handler defaults to a new logging.StreamHandler and formatter to a
    "LEVEL: message" formatter; the formatter is always set on the handler.
    The handler installed by a previous call is removed first, so repeated
    calls replace it instead of stacking handlers. The logger level is
    changed only when level is given.
    """
    if handler is None:
        handler = logging.StreamHandler()
    if formatter is None:
        formatter = logging.Formatter("%(levelname)s: %(message)s")
    handler.setFormatter(formatter)

    logger = logging.getLogger('linkexchange')
    # The handler installed last time is remembered on the logger itself.
    if hasattr(logger, '_lx_handler'):
        logger.removeHandler(logger._lx_handler)
    logger.addHandler(handler)
    logger._lx_handler = handler

    if level is not None:
        logger.setLevel(level)
|---|
| 194 | |
|---|
class LinkFinder(HTMLParser.HTMLParser):
    """
    HTML parser that reports the <a> elements found in the fed markup.

    For every <a> element that is not nested inside one of the tags listed
    in exclude_tags, found_callback(attrs, text) is called when the element
    is closed. attrs is a dictionary of the tag attributes and text is the
    character data inside the element, with character and entity references
    kept in their source form. Without a callback the (attrs, text) tuples
    are collected in the links attribute.
    """

    def __init__(self, found_callback=None):
        HTMLParser.HTMLParser.__init__(self)
        # Character data seen since the last tag.
        self.char_buf = []
        # Links collected by the default callback.
        self.links = []
        if found_callback is None:
            found_callback = lambda *args: self.links.append(args)
        self.found_callback = found_callback
        # Links inside these elements are not reported; 'a' is listed too,
        # so a link nested in another link does not start a new one.
        self.exclude_tags = set(['a',
            'textarea', 'select', 'script', 'style',
            'label', 'noscript' , 'noindex', 'button'])
        # Stack of the currently open excluded tags.
        self.exclude_ctx = []
        # State of the link being read, if any.
        self.in_link = False
        self.in_link_attrs = []
        self.in_link_text = ''

    def handle_starttag(self, tag, attrs):
        self.handle_realdata()
        if tag == 'a' and not self.exclude_ctx:
            self.in_link = True
            self.in_link_attrs = attrs
            self.in_link_text = ''
        if tag in self.exclude_tags:
            self.exclude_ctx.append(tag)

    def handle_endtag(self, tag):
        self.handle_realdata()
        if tag == 'a' and self.in_link:
            self.found_callback(
                    dict(self.in_link_attrs), self.in_link_text)
            self.in_link = False
        # A closing tag with no matching opening tag (malformed markup)
        # finds the stack empty; ignore it instead of raising IndexError.
        if tag in self.exclude_tags and self.exclude_ctx:
            self.exclude_ctx.pop()

    def handle_startendtag(self, tag, attrs):
        # Self-closing tags only delimit character data.
        self.handle_realdata()

    def handle_data(self, data):
        self.char_buf.append(data)

    def handle_charref(self, name):
        # Keep the reference as written in the source.
        self.char_buf.append('&#%s;' % name)

    def handle_entityref(self, name):
        # Keep the reference as written in the source.
        self.char_buf.append('&%s;' % name)

    def handle_realdata(self):
        """
        Flushes the buffered character data, appending it to the text of
        the current link when inside one.
        """
        content = ''.join(self.char_buf)
        self.char_buf[:] = []
        if self.in_link:
            self.in_link_text += content
|---|
| 244 | |
|---|
def find_links(html):
    """
    Searches for links in HTML code.

    Feeds html to a new LinkFinder and returns the list of
    (attributes dictionary, link text) tuples it collected.

    >>> find_links('<p>Some text <a href="/ref">with link</a>.</p>')
    [({'href': '/ref'}, 'with link')]
    """
    parser = LinkFinder()
    parser.feed(html)
    # close() makes the parser process any input that is still buffered.
    parser.close()
    found = parser.links
    return found
|---|
| 256 | |
|---|
# When executed as a script, run the doctests embedded in the docstrings
# of this module.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
|---|