Example webproxy.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python
"""A web application that retrieves other websites for you.

To start serving the application on port 8088, type

  python webproxy.py

To start the server on some other interface/port, use

  python -m gevent.wsgi -p 8000 -i 0.0.0.0 webproxy.py

"""
from __future__ import print_function
from gevent import monkey; monkey.patch_all()
import sys
import re
import traceback

try:
    from cgi import escape
except ImportError:
    # cgi.escape was removed in Python 3.8 (and the cgi module itself in 3.13);
    # html.escape is its replacement.
    from html import escape

try:
    import urllib2
    from urlparse import urlparse
    from urllib import unquote
except ImportError:
    # pylint:disable=import-error,no-name-in-module
    from urllib import request as urllib2
    from urllib.parse import urlparse
    from urllib.parse import unquote

# (host, port) pair the WSGI server is bound to when this file is run as a script.
LISTEN = ('127.0.0.1', 8088)


def _as_bytes(s):
    if not isinstance(s, bytes): # Py3
        s = s.encode('utf-8')
    return s


def _as_str(s):
    if not isinstance(s, str): # Py3
        s = s.decode('latin-1')
    return s


def application(env, start_response):
    """WSGI entry point: serve the URL form, redirect form posts, proxy GETs.

    Routing, based on the request method and the path (query string included,
    leading slashes removed):

    * ``GET /``        -> ``200`` with the ``FORM`` page.
    * ``GET /<url>``   -> the remote page, fetched through ``proxy``.
    * ``POST /``       -> reads the ``url`` form field and answers ``302``
      pointing at ``/<url>`` on this proxy; ``400`` if the body carries no
      ``url`` field.
    * ``POST /<path>`` -> ``404``.
    * anything else    -> ``501``.

    Returns the response body as a list of bytes (empty for bodiless replies).
    """
    # HTTP_HOST mirrors the Host request header and is absent when the client
    # sent none; fall back to the server name/port that WSGI always provides.
    host = env.get('HTTP_HOST')
    if not host:
        host = '%s:%s' % (env['SERVER_NAME'], env['SERVER_PORT'])
    proxy_url = 'http://%s/' % host

    method = env['REQUEST_METHOD']
    # PATH_INFO and QUERY_STRING may legitimately be missing when empty.
    path = env.get('PATH_INFO', '')
    query = env.get('QUERY_STRING')
    if query:
        path += '?' + query
    path = path.lstrip('/')

    if method == 'GET':
        if not path:
            start_response('200 OK', [('Content-Type', 'text/html')])
            return [FORM]
        return proxy(path, start_response, proxy_url)

    if method == 'POST':
        if path:
            start_response('404 Not Found', [])
            return []
        body = env['wsgi.input'].read().strip()
        # Split "k1=v1&k2=v2" into {k1: v1, ...}; partition()[::2] yields
        # (key, value) and tolerates '=' inside the value or a missing '='.
        fields = dict(pair.partition(b'=')[::2] for pair in body.split(b'&'))
        value = fields.get(b'url')
        if value is None:
            # Malformed client input is a client error, not a server crash.
            start_response('400 Bad Request', [])
            return []
        target = unquote(_as_str(value))
        start_response('302 Found', [('Location', _as_str(join(proxy_url, target)))])
        return []

    start_response('501 Not Implemented', [])
    return []


def proxy(path, start_response, proxy_url):
    """Fetch ``path`` from its origin server and relay it as the WSGI response.

    ``path`` is the target URL taken from the request path; ``http://`` is
    prepended when it contains no ``://``.  Only ``http`` and ``https``
    targets are fetched.  The upstream status is passed through (HTTP error
    statuses included), the body has its links rewritten by ``fix_links`` so
    they keep pointing at ``proxy_url``, and the upstream headers are relayed
    except those named in ``drop_headers`` and ``Content-Length``, which is
    recomputed for the rewritten body.

    If the target cannot be fetched or read, a ``502 Bad Gateway`` HTML page
    showing the error and its traceback is returned instead.

    Returns the response body as a one-element list of bytes.
    """
    # pylint:disable=too-many-locals
    if '://' not in path:
        path = 'http://' + path
    try:
        scheme, netloc = urlparse(path)[:2]
        # The URL comes straight from the client: refuse anything that is not
        # plain HTTP(S) rather than handing e.g. file:// URLs to urlopen.
        if scheme.lower() not in ('http', 'https'):
            raise ValueError('unsupported URL scheme: %r' % scheme)
        try:
            response = urllib2.urlopen(path)
        except urllib2.HTTPError as ex:
            # An HTTP error status still carries headers and a body to relay.
            response = ex
        try:
            status = '%s %s' % (response.code, response.msg)
            print('%s: %s' % (path, status))
            upstream_headers = list(response.headers.items())
            data = response.read()
        finally:
            # Release the upstream connection whether or not the read succeeded.
            response.close()
    except Exception as ex: # pylint:disable=broad-except
        sys.stderr.write('error while reading %s:\n' % path)
        traceback.print_exc()
        tb = traceback.format_exc()
        start_response('502 Bad Gateway', [('Content-Type', 'text/html')])
        # pylint:disable=deprecated-method
        error_str = escape(str(ex) or ex.__class__.__name__ or 'Error')
        error_str = '<h1>%s</h1><h2>%s</h2><pre>%s</pre>' % (error_str, escape(path), escape(tb))
        return [_as_bytes(error_str)]

    data = fix_links(data, proxy_url, scheme + '://' + netloc)
    # Header names are case-insensitive and arrive in their original case on
    # Python 3, so compare lower-cased.  The upstream Content-Length describes
    # the body before link rewriting and must not be relayed.
    headers = [(name, value) for (name, value) in upstream_headers
               if name.lower() not in drop_headers and name.lower() != 'content-length']
    headers.append(('Content-Length', str(len(data))))
    start_response(status, headers)
    return [data]


def join(url1, *rest):
    """Concatenate URL pieces, putting exactly one ``/`` at each seam.

    With no extra pieces ``url1`` is returned untouched.  Otherwise every
    piece is converted with ``_as_bytes`` and the result is bytes.  At each
    seam a single slash is kept when both sides supply one, and a slash is
    inserted when neither side does.
    """
    if not rest:
        return url1

    joined = _as_bytes(url1)
    for piece in rest:
        piece = _as_bytes(piece)
        left_has_slash = joined.endswith(b'/')
        right_has_slash = piece.startswith(b'/')
        if left_has_slash and right_has_slash:
            # Both sides bring a slash: keep only one.
            joined = joined + piece[1:]
        elif left_has_slash or right_has_slash:
            joined = joined + piece
        else:
            joined = joined + b'/' + piece
    return joined


def fix_links(data, proxy_url, host_url):
    """Rewrite link attributes in ``data`` so they route back through the proxy.

    ``data`` is searched with the bytes patterns ``_link_re_1`` (quoted
    attribute values) and then ``_link_re_2`` (unquoted values).  Each matched
    URL is replaced by a double-quoted URL under ``proxy_url``: a URL that
    contains ``://`` is appended to ``proxy_url`` directly, any other URL is
    appended to ``proxy_url`` followed by ``host_url``.

    For instance, with ``proxy_url='http://127.0.0.1:8088'`` and
    ``host_url='www.google.com'``, ``src=images/hp0.gif`` becomes
    ``src="http://127.0.0.1:8088/www.google.com/images/hp0.gif"``.

    Returns the rewritten data.
    """
    def _rewrite(match):
        target = match.group('url')
        if b'://' in target:
            pieces = (proxy_url, target)
        else:
            # No scheme: anchor the URL on the page's own host.
            pieces = (proxy_url, host_url, target)
        return match.group('before') + b'"' + join(*pieces) + b'"'

    # Quoted values first; the unquoted pattern skips values that start with a
    # quote, so links rewritten in the first pass are not matched again.
    for pattern in (_link_re_1, _link_re_2):
        data = pattern.sub(_rewrite, data)
    return data

_link_re_1 = re.compile(br'''(?P<before>(href|src|action)\s*=\s*)(?P<quote>['"])(?P<url>[^#].*?)(?P=quote)''')
_link_re_2 = re.compile(br'''(?P<before>(href|src|action)\s*=\s*)(?P<url>[^'"#>][^ >]*)''')

drop_headers = ['transfer-encoding', 'set-cookie']

# Landing page returned for ``GET /``: a one-field form that POSTs the URL to
# visit back to ``/``.
FORM = b"""<html><head>
<title>Web Proxy - gevent example</title></head><body>
<table width=60% height=100% align=center>
<tr height=30%><td align=center valign=bottom>Type in URL you want to visit and press Enter</td></tr>
<tr><td align=center valign=top>
<form action=/ method=post>
<input size=80 name=url value="http://www.gevent.org"/>
</form>
</td></tr>
</table></body></html>
"""

if __name__ == '__main__':
    # Script entry point: announce the address, then serve ``application``
    # on ``LISTEN`` until the process is stopped.
    from gevent.pywsgi import WSGIServer

    print('Serving on %s...' % (LISTEN,))
    server = WSGIServer(LISTEN, application)
    server.serve_forever()

Current source

Next page: Example webpy.py