【python】获取http响应

  • Post author:
  • Post category:python


一个相对完整的http请求,输入ip和端口,输出响应码,响应头,响应体,是否超时,以及出错时的错误信息

处理包括:

1.协议处理,如果是443用https,其他用http

2.HTTPError处理,HTTPError一般是401,403,404之类的错误,虽然报错,但是也有响应头。注意获取错误信息时要用str(e);repr(e)虽然返回的也是字符串,但那只是异常对象的表示形式(例如"HTTPError()"),不是可读的错误原因;e.read()得到的是响应体,也不是错误原因

3.URLError处理,一般是Connection refused之类的错误。注意获取错误信息时要用str(e.reason)

4.响应体gzip解压

5.响应体编码转换

# coding=utf8

import urllib2
import chardet
import traceback
import StringIO
import re
import gzip


def plugin_homepage(data, timeout):
    """Fetch the root page ("/") of a host and summarize the HTTP response.

    Args:
        data: Mapping holding the keys "ip" (host address) and "port".
            The port may be an int or its string form.
        timeout: Timeout handed unchanged to get_html().

    Returns:
        A dict with the keys "ip", "port", "rsp_header", "rsp_body", "code",
        "title", "is_timeout" and "error_reason". "ip" and "port" are echoed
        back exactly as supplied; the remaining values are those returned by
        get_html().

    Raises:
        KeyError: if "ip" or "port" is missing from *data*.
    """
    ip = data["ip"]
    port = data["port"]

    # Compare as text so that a port supplied as the string "443" selects
    # HTTPS just like the integer 443 does.
    port_text = str(port).strip()
    scheme = "https" if port_text == "443" else "http"

    # An IPv6 literal has to be wrapped in brackets inside a URL, otherwise
    # its colons are indistinguishable from the host/port separator.
    host = "%s" % (ip,)
    if ":" in host and not host.startswith("["):
        host = "[%s]" % host

    url = "%s://%s:%s/" % (scheme, host, port_text)
    is_timeout, error_reason, code, header, body, title = get_html(url, timeout)
    return {
        "ip": ip,
        "port": port,
        "rsp_header": header,
        "rsp_body": body,
        "code": code,
        "title": title,
        "is_timeout": is_timeout,
        "error_reason": error_reason,
    }


def _header_text(header):
    """Render response headers as text, keeping None when there are none."""
    if header is None:
        return None
    return str(header)


def get_html(url, timeout):
    """Request *url* and describe the outcome as a 6-tuple.

    Args:
        url: Absolute http:// or https:// URL to fetch.
        timeout: Timeout passed to urllib2.urlopen().

    Returns:
        A tuple ``(is_timeout, error_reason, code, header, body, title)``:

        * is_timeout   -- True when the failure message reports a timeout.
        * error_reason -- error text, or None when the request succeeded.
        * code         -- HTTP status code (also set for HTTPError responses),
                          or None when no response was received.
        * header       -- response headers rendered as text, or None.
        * body         -- response body, gunzipped and re-encoded to UTF-8
                          when that is possible, otherwise the raw bytes.
        * title        -- content of the first <title> element, or None.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    request_headers = {'User-Agent': user_agent}
    is_timeout = False
    error_reason = None
    code = None
    header = None
    body = None
    title = None
    try:
        request = urllib2.Request(url, headers=request_headers)
        response = urllib2.urlopen(request, timeout=timeout)
        try:
            code = response.getcode()
            # Keep the header *object* for now: the gzip and charset steps
            # below need per-field lookups, which a flattened string cannot do.
            header = response.headers
            body = response.read()
        finally:
            response.close()
    except urllib2.HTTPError as e:
        # 4xx/5xx answers raise, but they still carry a status, headers and
        # usually a body, so processing continues after this handler.
        error_reason = str(e)
        code = e.code
        header = e.headers
        try:
            body = e.read()
        except Exception:
            # The error body is optional; keep what is already known.
            traceback.print_exc()
    except urllib2.URLError as e:
        # Connection-level failure (refused, DNS, connect timeout, ...).
        traceback.print_exc()
        error_reason = str(e.reason)
        is_timeout = "timed out" in error_reason
        return (is_timeout, error_reason, code, _header_text(header), body,
                title)
    except Exception as e:
        # Includes timeouts raised while reading the body, which are not
        # wrapped in URLError.
        traceback.print_exc()
        error_reason = str(e)
        is_timeout = "timed out" in error_reason
        return (is_timeout, error_reason, code, _header_text(header), body,
                title)

    if not header or not body:
        return (is_timeout, error_reason, code, _header_text(header), body,
                title)

    # Decompress a gzip-encoded body.
    content_encoding = header.get('Content-Encoding') or ''
    if 'gzip' in content_encoding.lower():
        try:
            compressed = StringIO.StringIO(body)
            body = gzip.GzipFile(fileobj=compressed).read()
        except Exception:
            # A mislabelled or truncated stream fails with several unrelated
            # exception types; report it and keep the raw bytes.
            traceback.print_exc()

    # Convert the body to UTF-8 (best effort).
    try:
        html_encode = get_encode(header, body)
        if html_encode:
            html_encode = html_encode.strip()
        if html_encode:
            # No length cap on the name: an unknown or garbage charset makes
            # decode() raise, and a cap would reject valid names such as
            # "windows-1252".
            body = body.decode(html_encode).encode('utf-8')
    except Exception:
        # Unknown charset or undecodable bytes: keep the body unchanged.
        pass

    # Extract the page title; re.S lets a title span several lines.
    found = re.search(r'<title[^>]*>(.*?)</title>', body, flags=re.I | re.S)
    if found:
        title = found.group(1).strip()

    return is_timeout, error_reason, code, _header_text(header), body, title


# Detect the character encoding of an HTML response.

# charset=... inside a single <meta ...> tag; the value may be double-quoted,
# single-quoted or bare.
_META_CHARSET_RE = re.compile(
    r'<meta[^>]*?charset\s*=\s*["\']?\s*([\w.:-]+)', re.I)
# charset=... parameter of a Content-Type value.
_HEADER_CHARSET_RE = re.compile(r'charset\s*=\s*["\']?\s*([\w.:-]+)', re.I)
# The Content-Type line of a header block that was flattened to text.
_CONTENT_TYPE_LINE_RE = re.compile(
    r'^Content-Type:[ \t]*(.*?)\s*$', re.I | re.M)


def _content_type_of(header):
    """Return the Content-Type value of *header*, or None if there is none."""
    if header is None:
        return None
    getter = getattr(header, 'get', None)
    if getter is not None:
        # Mapping-like headers (dict, message object).
        return getter('Content-Type')
    # Anything else is treated as raw header text.
    line = _CONTENT_TYPE_LINE_RE.search(str(header))
    if line:
        return line.group(1)
    return None


def get_encode(header, body):
    """Guess the character encoding of an HTML document.

    The sources are tried in this order:

    1. a ``charset=`` declaration inside a ``<meta>`` tag of *body*;
    2. the ``charset`` parameter of the Content-Type header;
    3. statistical detection with chardet.

    Args:
        header: Response headers, either a mapping-like object offering
            ``get()`` or the raw header text. May be None.
        body: The HTML document as a string. May be None or empty.

    Returns:
        The encoding name without surrounding quotes, or whatever chardet
        reports (which may be None). None is also returned when there is no
        body to analyse and no charset was declared.
    """
    if body:
        try:
            declared = _META_CHARSET_RE.search(body)
        except TypeError:
            # body is not a string type; skip the markup check.
            declared = None
        if declared:
            return declared.group(1)

    content_type = _content_type_of(header)
    if content_type:
        declared = _HEADER_CHARSET_RE.search(str(content_type))
        if declared:
            return declared.group(1)

    if not body:
        return None
    detection = chardet.detect(body)
    return detection['encoding']


if __name__ == "__main__":
    # Manual smoke test: probe a web server on localhost port 80 with a
    # 3-second timeout and print the resulting summary dict.
    print(plugin_homepage({"ip": "127.0.0.1", "port": 80}, 3))