Example webproxy.py

  1#!/usr/bin/env python
  2"""A web application that retrieves other websites for you.
  3
  4To start serving the application on port 8088, type
  5
  6  python webproxy.py
  7
  8To start the server on some other interface/port, use
  9
 10  python -m gevent.wsgi -p 8000 -i 0.0.0.0 webproxy.py
 11
 12"""
 13from __future__ import print_function
 14from gevent import monkey; monkey.patch_all()
 15import sys
 16import re
 17import traceback
 18
 19try:
 20    from cgi import escape
 21except ImportError:
 22    # Python 3.8 removed this API
 23    from html import escape
 24
 25try:
 26    import urllib2
 27    from urlparse import urlparse
 28    from urllib import unquote
 29except ImportError:
 30    # pylint:disable=import-error,no-name-in-module
 31    from urllib import request as urllib2
 32    from urllib.parse import urlparse
 33    from urllib.parse import unquote
 34
# Default (host, port) address the WSGI server binds to when this file is
# run directly as a script.
LISTEN = ('127.0.0.1', 8088)
 36
 37
 38def _as_bytes(s):
 39    if not isinstance(s, bytes): # Py3
 40        s = s.encode('utf-8')
 41    return s
 42
 43
 44def _as_str(s):
 45    if not isinstance(s, str): # Py3
 46        s = s.decode('latin-1')
 47    return s
 48
 49
 50def application(env, start_response):
 51    proxy_url = 'http://%s/' % env['HTTP_HOST']
 52    method = env['REQUEST_METHOD']
 53    path = env['PATH_INFO']
 54    if env['QUERY_STRING']:
 55        path += '?' + env['QUERY_STRING']
 56    path = path.lstrip('/')
 57
 58    if (method, path) == ('GET', ''):
 59        start_response('200 OK', [('Content-Type', 'text/html')])
 60        return [FORM]
 61
 62    if method == 'GET':
 63        return proxy(path, start_response, proxy_url)
 64
 65    if (method, path) == ('POST', ''):
 66        key, value = env['wsgi.input'].read().strip().split(b'=')
 67        assert key == b'url', repr(key)
 68        value = _as_str(value)
 69        start_response('302 Found', [('Location', _as_str(join(proxy_url, unquote(value))))])
 70    elif method == 'POST':
 71        start_response('404 Not Found', [])
 72    else:
 73        start_response('501 Not Implemented', [])
 74    return []
 75
 76
 77def proxy(path, start_response, proxy_url):
 78    # pylint:disable=too-many-locals
 79    if '://' not in path:
 80        path = 'http://' + path
 81
 82    try:
 83        try:
 84            response = urllib2.urlopen(path)
 85        except urllib2.HTTPError as ex:
 86            response = ex
 87        print('%s: %s %s' % (path, response.code, response.msg))
 88        # Beginning in Python 3.8, headers aren't guaranteed to arrive in
 89        # lowercase; we must do so ourself.
 90        headers = [(k, v) for (k, v) in response.headers.items() if k.lower() not in DROP_HEADERS]
 91        scheme, netloc, path, _params, _query, _fragment = urlparse(path)
 92        host = (scheme or 'http') + '://' + netloc
 93    except Exception as ex: # pylint:disable=broad-except
 94        sys.stderr.write('error while reading %s:\n' % path)
 95        traceback.print_exc()
 96        tb = traceback.format_exc()
 97        start_response('502 Bad Gateway', [('Content-Type', 'text/html')])
 98        # pylint:disable=deprecated-method
 99        error_str = escape(str(ex) or ex.__class__.__name__ or 'Error')
100        error_str = '<h1>%s</h1><h2>%s</h2><pre>%s</pre>' % (error_str, escape(path), escape(tb))
101        return [_as_bytes(error_str)]
102    else:
103        print("Returning", headers)
104        start_response('%s %s' % (response.code, response.msg), headers)
105        data = response.read()
106        data = fix_links(data, proxy_url, host)
107        return [data]
108
109
def join(url1, *rest):
    """Concatenate URL pieces with exactly one ``/`` at each seam.

    With no extra pieces, *url1* is returned unchanged (and unconverted).
    Otherwise every piece is converted to bytes and appended in order: a
    doubled slash at a seam is collapsed to one, and a slash is inserted
    when neither side provides one. The result is bytes.
    """
    if not rest:
        return url1

    joined = _as_bytes(url1)
    for piece in rest:
        piece = _as_bytes(piece)
        left_has_slash = joined.endswith(b'/')
        right_has_slash = piece.startswith(b'/')
        if left_has_slash and right_has_slash:
            # Both sides bring a slash: keep only one.
            joined += piece[1:]
        elif left_has_slash or right_has_slash:
            joined += piece
        else:
            joined += b'/' + piece
    return joined
125
126
def fix_links(data, proxy_url, host_url):
    """Rewrite ``href``/``src``/``action`` URLs in *data* to go through the proxy.

    URLs containing ``://`` are appended to *proxy_url*; all other URLs are
    appended to *proxy_url* followed by *host_url*. Every rewritten URL is
    emitted inside double quotes. *data* is matched against bytes patterns
    (first the quoted form, then the unquoted form).

    >>> fix_links(b"><img src=images/hp0.gif width=158", 'http://127.0.0.1:8088', 'www.google.com')
    b'><img src="http://127.0.0.1:8088/www.google.com/images/hp0.gif" width=158'
    """
    def fix_link_cb(m):
        link = m.group('url')
        if b'://' in link:
            target = join(proxy_url, link)
        else:
            target = join(proxy_url, host_url, link)
        return b''.join((m.group('before'), b'"', target, b'"'))

    for pattern in (_link_re_1, _link_re_2):
        data = pattern.sub(fix_link_cb, data)
    return data
141    data = _link_re_2.sub(fix_link_cb, data)
142    return data
143
# Common prefix of both link patterns: the attribute name and its '=' sign,
# captured as the 'before' group.
_LINK_ATTR = br'''(?P<before>(href|src|action)\s*=\s*)'''
# Quoted attribute value: the 'url' group runs up to the matching quote and
# must not start with '#'.
_link_re_1 = re.compile(_LINK_ATTR + br'''(?P<quote>['"])(?P<url>[^#].*?)(?P=quote)''')
# Unquoted attribute value: the 'url' group runs up to the next space or '>'
# and must not start with a quote, '#' or '>'.
_link_re_2 = re.compile(_LINK_ATTR + br'''(?P<url>[^'"#>][^ >]*)''')
146
# The lowercase names of headers that we will *NOT* forward.
DROP_HEADERS = {
    # Body framing: the body is re-sent by this server after its links have
    # been rewritten, so the upstream framing no longer describes it.
    'transfer-encoding',
    'content-length',
    'set-cookie',
    # Remaining hop-by-hop headers; these describe the upstream connection
    # only and a WSGI application must not emit them.
    'connection',
    'keep-alive',
    'proxy-authenticate',
    'proxy-authorization',
    'te',
    'trailer',
    'trailers',
    'upgrade',
}
152
# Landing page returned for ``GET /``: a single-field form that posts the
# URL to visit back to ``/``.
FORM = b"""<html><head>
<title>Web Proxy - gevent example</title></head><body>
<table width=60% height=100% align=center>
<tr height=30%><td align=center valign=bottom>Type in URL you want to visit and press Enter</td></tr>
<tr><td align=center valign=top>
<form action=/ method=post>
<input size=80 name=url value="http://www.gevent.org"/>
</form>
</td></tr>
</table></body></html>
"""
164
if __name__ == '__main__':
    # Script entry point: serve ``application`` on the LISTEN address until
    # the process is stopped.
    from gevent.pywsgi import WSGIServer
    print('Serving on %s...' % (LISTEN,))
    server = WSGIServer(LISTEN, application)
    server.serve_forever()

Current source