source: LinkExchange/trunk/linkexchange/utils.py @ 224

Revision 224, 8.0 KB checked in by lostclus, 9 months ago (diff)

Catch all socket exceptions.

Line 
1# LinkExchange - Universal link exchange service client
2# Copyright (C) 2009 Konstantin Korikov
3#
4# This library is free software; you can redistribute it and/or
5# modify it under the terms of the GNU Lesser General Public
6# License as published by the Free Software Foundation; either
7# version 2.1 of the License, or (at your option) any later version.
8#
9# This library is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12# Lesser General Public License for more details.
13#
14# You should have received a copy of the GNU Lesser General Public
15# License along with this library; if not, write to the Free Software
16# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17#
18# NOTE: In the context of the Python environment, I interpret "dynamic
19# linking" as importing -- thus the LGPL applies to the contents of
20# the modules, but make no requirements on code importing these
21# modules.
22
23import sys
24import pkg_resources
25import urllib
26import urllib2
27import urlparse
28import httplib
29import socket
30import logging
31import HTMLParser
32
33try:
34    set
35except NameError:
36    from sets import Set as set
37
# Version string of the installed 'LinkExchange' distribution, looked up
# through pkg_resources when the module is imported.
version = pkg_resources.get_distribution('LinkExchange').version

# Identification string built from the version above.
# NOTE(review): presumably sent as the HTTP User-Agent header by the
# clients -- the usage is outside this module, confirm there.
default_user_agent = 'LinkExchange/%s (+http://linkexchange.org.ua)' % (version,)
40
def is_plugin_specifier(x):
    """
    Tells whether x has the shape of a plugin specifier.

    A specifier is a list or a tuple; load_plugin() reads it as
    (name[, args[, kwargs]]). Only the container type is checked here, the
    contents are not inspected. Subclasses of list and tuple are accepted.

    >>> is_plugin_specifier(('name', [1], {}))
    True
    >>> is_plugin_specifier('name')
    False
    """
    # isinstance() instead of an exact type() comparison, so that list/tuple
    # subclasses are recognized as specifiers as well.
    return isinstance(x, (list, tuple))
43
def load_plugin(space, specifier):
    """
    Loads and instantiates the plugin described by specifier.

    space -- entry point group that is searched with
        pkg_resources.iter_entry_points().
    specifier -- sequence (name[, args[, kwargs]]): entry point name,
        optional positional arguments and optional keyword arguments for the
        plugin class. Elements after the third one are ignored.

    Entry points are tried in the order pkg_resources yields them; the first
    one that loads is called with the arguments and the result is returned.

    Raises ImportError when no entry point with that name exists. When entry
    points exist but every one of them failed to load with ImportError, the
    last of those errors is raised. Raises IndexError for an empty specifier.
    """
    specifier = list(specifier)
    name = specifier[0]
    args = []
    kwargs = {}
    if len(specifier) > 1:
        args = specifier[1]
    if len(specifier) > 2:
        kwargs = specifier[2]

    error = None
    for ep in pkg_resources.iter_entry_points(space, name):
        try:
            cls = ep.load()
        except ImportError:
            # Taken from sys.exc_info() so that nothing depends on the
            # except-clause variable staying bound after the handler ends
            # (and so the same spelling works on every Python version).
            error = sys.exc_info()[1]
        else:
            return cls(*args, **kwargs)
    if error is not None:
        raise error
    raise ImportError("No entry point found: %s.%s" % (space, name))
67
def urlopen_with_timeout(url, timeout):
    """
    Opens URL with timeout. Returns file-like object.

    On Python 2.6 and later the timeout is passed straight to
    urllib2.urlopen(). Older interpreters go through a custom HTTP handler
    whose connection calls settimeout() on its socket once connect() has
    returned, so there the timeout is only in force after the connection
    has been established.

    >>> import threading
    >>> import BaseHTTPServer, SimpleHTTPServer
    >>> addr, port = ('127.0.0.1', 49152)
    >>> url = 'http://%s:%d' % (addr, port)
    >>> ready = threading.Event()
    >>> shutdown = threading.Event()
    >>> def serve():
    ...     srv = BaseHTTPServer.HTTPServer((addr, port),
    ...             SimpleHTTPServer.SimpleHTTPRequestHandler)
    ...     ready.set()
    ...     srv.handle_request()
    ...     shutdown.wait()

    >>> srv_t = threading.Thread(target=serve)
    >>> srv_t.start()
    >>> ignore = ready.wait()
    >>> f = urlopen_with_timeout(url, 0.1)
    >>> 'content-type' in f.info()
    True
    >>> try: f = urlopen_with_timeout(url, 0.1)
    ... except urlopen_errors: f = None
    >>> f is None
    True
    >>> shutdown.set()
    >>> srv_t.join()
    """
    if sys.version_info >= (2, 6):
        # urlopen() accepts the timeout natively.
        return urllib2.urlopen(url, None, timeout)

    class _NonBlockingHTTPConnection(httplib.HTTPConnection):
        def connect(self):
            httplib.HTTPConnection.connect(self)
            self.sock.settimeout(timeout)

    # Before Python 2.4 do_open() is given an httplib.HTTP subclass; later
    # versions are given the connection class itself.
    if sys.version_info < (2, 4):
        class _NonBlockingHTTP(httplib.HTTP):
            _connection_class = _NonBlockingHTTPConnection
        http_class = _NonBlockingHTTP
    else:
        http_class = _NonBlockingHTTPConnection

    class _NonBlockingHTTPHandler(urllib2.HTTPHandler):
        def http_open(self, req):
            return self.do_open(http_class, req)

    opener = urllib2.build_opener(_NonBlockingHTTPHandler)
    return opener.open(url)
117
# Exception classes to catch around urlopen_with_timeout(); the tuple can be
# used directly in an ``except`` clause, as that function's doctest does.
urlopen_errors = (
    urllib2.URLError,
    httplib.HTTPException,
    OSError,
    socket.error,
    socket.herror,
    socket.gaierror,
    socket.timeout,
)
120
def normalize_uri(uri):
    """
    Returns uri in a normalized spelling.

    A unicode uri is first encoded to a UTF-8 byte string. The path
    component is percent-decoded and quoted again (with '/' left as is), and
    slashes at its end are removed; the first character of the path is never
    removed, so a path of '/' stays '/'. Scheme, host, query and fragment
    are passed through unchanged.
    """
    if isinstance(uri, unicode):
        uri = uri.encode('utf-8')
    scheme, netloc, path, query, fragment = urlparse.urlsplit(uri)
    # Decode-then-quote brings differently escaped paths to one form.
    path = urllib.quote(urllib.unquote(path), '/')
    head, tail = path[:1], path[1:]
    path = head + tail.rstrip('/')
    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
128
def rearrange_blocks(request, blocks, rearrange_map = None):
    """
    Rearranges links blocks according to rearrange_map and depending on
    request URI.

    Each rearrange_map entry (i1, i2, o1, o2) spreads blocks[i1:i2] over the
    output positions o1 .. o2-1. Filling starts at a position derived from
    the sum of the character codes of request.uri, wraps around inside the
    range and skips positions that an earlier entry has already filled.
    Without a rearrange_map all blocks are rotated inside a single range of
    len(blocks) positions.

    Returns a list as long as the largest o2; positions that received no
    block hold u"".

    An entry whose output range is empty (o2 <= o1) places nothing, and
    blocks that do not fit into the free positions of their range are
    dropped.

    >>> from linkexchange.clients import PageRequest
    >>> request = PageRequest(host='example.com', uri='/')
    >>> ord('/')
    47
    >>> blocks = [u'b1', u'b2', u'b3']
    >>> rearrange_map = [(0, 2, 0, 3), (2, 3, 3, 5)]
    >>> rearrange_blocks(request, blocks, rearrange_map)
    [u'b2', u'', u'b1', u'', u'b3']
    """
    if rearrange_map is None:
        rearrange_map = [(0, len(blocks), 0, len(blocks))]
    req_sum = sum(map(ord, request.uri))
    placed = {}
    result_len = 0
    for i1, i2, o1, o2 in rearrange_map:
        if o2 > result_len:
            result_len = o2
        width = o2 - o1
        if width <= 0:
            # No output positions: nothing to place, and the modulo below
            # would divide by zero (happens for the default map when blocks
            # is empty).
            continue
        pending = blocks[i1:i2]
        start = req_sum % width
        taken = 0
        # Visit every position of the range exactly once, beginning at the
        # URI dependent offset. Bounding the walk by the range width keeps
        # it finite when there are more blocks than free positions.
        for step in range(width):
            if taken >= len(pending):
                break
            slot = o1 + (start + step) % width
            if slot not in placed:
                placed[slot] = pending[taken]
                taken += 1
    return [placed.get(i, u"") for i in range(result_len)]
160
def parse_rearrange_map(map_str):
    """
    Parse rearrange map string as it is specified in the configuration file.

    The string is a comma separated list of entries "i1:i2-o1:o2"; every
    entry becomes a tuple of four integers (i1, i2, o1, o2). Whitespace
    around an entry is ignored.

    Raises ValueError when an entry does not have that form.

    >>> parse_rearrange_map('0:1-0:3,1:2-3:5,2:3-0:3')
    [(0, 1, 0, 3), (1, 2, 3, 5), (2, 3, 0, 3)]
    """
    def to_quad(text):
        # Wrong number of '-' or ':' separated parts fails the unpacking
        # with ValueError, just like a non-numeric field fails int().
        source, target = text.strip().split('-')
        src_from, src_to = source.split(':')
        dst_from, dst_to = target.split(':')
        return (int(src_from), int(src_to), int(dst_from), int(dst_to))

    try:
        return [to_quad(piece) for piece in map_str.split(',')]
    except ValueError:
        raise ValueError("Invalid rearrange map string")
178
def configure_logger(handler=None, formatter=None, level=None):
    """
    Installs a handler on the 'linkexchange' logger.

    The handler installed by a previous call of this function is removed
    first, so repeated calls replace the handler instead of stacking them.

    handler -- logging handler to install; a logging.StreamHandler is
        created when omitted.
    formatter -- formatter to set on the handler. When omitted, a handler
        created here gets "%(levelname)s: %(message)s" and a handler passed
        in keeps whatever formatter it already has.
    level -- logger level to set; the level is left alone when omitted.
    """
    logger = logging.getLogger('linkexchange')
    if handler is None:
        handler = logging.StreamHandler()
        if formatter is None:
            formatter = logging.Formatter("%(levelname)s: %(message)s")
    # Applied to caller supplied handlers as well; an explicit formatter
    # must not be dropped just because a handler came with it.
    if formatter is not None:
        handler.setFormatter(formatter)
    # The handler of the previous call is remembered on the logger object.
    previous = getattr(logger, '_lx_handler', None)
    if previous is not None:
        logger.removeHandler(previous)
    logger.addHandler(handler)
    logger._lx_handler = handler
    if level is not None:
        logger.setLevel(level)
194
class LinkFinder(HTMLParser.HTMLParser):
    """
    HTML parser that reports the <a>...</a> elements of the fed document.

    For every link found_callback(attrs, text) is called: attrs is a dict
    made of the tag attributes, text is the character data found inside the
    element. Nested tags contribute their text only; character and entity
    references handed to handle_charref()/handle_entityref() are kept in
    their "&...;" spelling. Without a callback the (attrs, text) tuples are
    appended to self.links.

    A link is not reported when it opens inside any tag of exclude_tags.
    'a' is one of those tags, so an anchor nested in another anchor does not
    start a second link.
    """

    def __init__(self, found_callback=None):
        HTMLParser.HTMLParser.__init__(self)
        # Character data seen since the last tag, not yet attributed.
        self.char_buf = []
        self.links = []
        if found_callback is None:
            found_callback = lambda *args: self.links.append(args)
        self.found_callback = found_callback
        self.exclude_tags = set(['a',
            'textarea', 'select', 'script', 'style',
            'label', 'noscript' , 'noindex', 'button'])
        # Stack of currently open tags that belong to exclude_tags.
        self.exclude_ctx = []
        # State of the link being collected, valid while in_link is true.
        self.in_link = False
        self.in_link_attrs = []
        self.in_link_text = ''

    def handle_starttag(self, tag, attrs):
        # Flush pending text before the state changes, so that text in
        # front of the tag is attributed to the context it appeared in.
        self.handle_realdata()
        if tag == 'a' and not self.exclude_ctx:
            self.in_link = True
            self.in_link_attrs = attrs
            self.in_link_text = ''
        if tag in self.exclude_tags:
            self.exclude_ctx.append(tag)

    def handle_endtag(self, tag):
        self.handle_realdata()
        if tag == 'a' and self.in_link:
            self.found_callback(
                    dict(self.in_link_attrs), self.in_link_text)
            self.in_link = False
        # A closing tag without an opening one (malformed HTML) finds the
        # stack empty; it is ignored instead of failing with IndexError.
        if tag in self.exclude_tags and self.exclude_ctx:
            self.exclude_ctx.pop()

    def handle_startendtag(self, tag, attrs):
        # Self-closing tags change no state; they only delimit text.
        self.handle_realdata()

    def handle_data(self, data):
        self.char_buf.append(data)

    def handle_charref(self, name):
        self.char_buf.append('&#%s;' % name)

    def handle_entityref(self, name):
        self.char_buf.append('&%s;' % name)

    def handle_realdata(self):
        """
        Empties the text buffer, adding its content to the text of the
        current link when one is being collected.
        """
        content = ''.join(self.char_buf)
        self.char_buf[:] = []
        if self.in_link:
            self.in_link_text += content
244
def find_links(html):
    """
    Searches for links in HTML code.

    Feeds html to a LinkFinder without a callback and returns the list it
    collected: one (attributes dict, link text) tuple per link.

    >>> find_links('<p>Some text <a href="/ref">with link</a>.</p>')
    [({'href': '/ref'}, 'with link')]
    """
    parser = LinkFinder()
    parser.feed(html)
    parser.close()
    return parser.links
256
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in the docstrings of
    # this module.
    from doctest import testmod
    testmod()
Note: See TracBrowser for help on using the repository browser.