# 获取图吧地图终端信息 (Fetch terminal information from the Mapbar map site)

#!/usr/bin/env python3
# Filename:shenzhen_pharm_analysis.py

#http://poi.mapbar.com/shenzhen/D20/


import os
import random
import socket
import sys
import telnetlib
import time

import requests
import xlrd
from lxml import html


# First entry of the module search path (the original note calls it the current folder path).
directory_path = sys.path[0]
# Timestamp of this run as YYYYmmddHHMMSS; strftime falls back to the current local time.
time0 = time.strftime('%Y%m%d%H%M%S')
print("".join(["当前操作文件夹:", directory_path, "\n当前时间:", time0]))

# 1.伪造请求头部信息


def proxies_get():  # Fetch the list of candidate proxies
    """Scrape proxy IPs and ports from the ip181.com listing page.

    Returns:
        On success, a tuple ``(ip_list, port_list)`` of equal length. Each
        element is the list of text nodes lxml returned for one table cell,
        so ``ip_list[k][0]`` and ``port_list[k][0]`` describe the same proxy.
        Rows where either cell is missing are skipped, so every returned
        element is non-empty.

        On any failure (network error, HTTP error status, unparsable page,
        or no proxy table found in the page) the string
        ``'Proxies Gather Error'`` is returned instead.
    """
    error_result = 'Proxies Gather Error'
    url_proxies = 'http://www.ip181.com/'
    try:
        r = requests.get(url=url_proxies, timeout=20)
        r.raise_for_status()
        page = html.fromstring(r.text)
        tables = page.xpath('/html/body/div[2]/div[1]/div[2]/div/div[2]/table/tbody')
    except Exception:
        # Best-effort scrape: report failure through the sentinel string, but
        # let KeyboardInterrupt/SystemExit propagate (a bare except hid them).
        return error_result

    if not tables:
        # The XPath matched nothing; report it like any other failure rather
        # than falling off the end of the function and returning None.
        return error_result

    table = tables[0]  # only the first matching table body is used
    ip_list = []
    port_list = []
    # Rows 2..101 are read; row 1 is presumably the header row (unconfirmed).
    for j in range(2, 102):
        ip_cell = table.xpath('tr[{}]/td[1]/text()'.format(j))
        port_cell = table.xpath('tr[{}]/td[2]/text()'.format(j))
        # Keep the two lists aligned and free of empty cells, so consumers
        # can safely index ``cell[0]``.
        if ip_cell and port_cell:
            ip_list.append(ip_cell)
            port_list.append(port_cell)
    return ip_list, port_list


def proxies_filter(lists):  # Probe each proxy and return the reachable ones
    """Return the ``'ip:port'`` strings of proxies that accept a TCP connection.

    Args:
        lists: A pair ``(ip_list, port_list)`` as produced by
            ``proxies_get()``; element ``k`` of each list is a sequence whose
            first item is the IP / port text of proxy ``k``. Anything that is
            not such a pair (for example the error string ``proxies_get()``
            returns on failure, or ``None``) yields an empty result.

    Returns:
        A list of ``'ip:port'`` strings, in input order, for every proxy that
        accepted a connection within 5 seconds. Entries with a missing IP or
        port are skipped.
    """
    list_out = []
    if not isinstance(lists, (list, tuple)) or len(lists) < 2:
        return list_out

    for index, (ip_cell, port_cell) in enumerate(zip(lists[0], lists[1]), start=1):
        if not ip_cell or not port_cell:
            continue  # row without an IP or port: nothing to probe
        # Scraped text may carry surrounding whitespace.
        ip_ip = str(ip_cell[0]).strip()
        ip_port = str(port_cell[0]).strip()
        if not ip_ip or not ip_port:
            continue
        address = ip_ip + ':' + ip_port
        print('Proxy' + str(index) + ':')
        try:
            # A plain TCP connect is all the check needs. The ``with`` block
            # closes the socket again instead of leaking it.
            with socket.create_connection((ip_ip, ip_port), timeout=5):
                pass
        except (OSError, ValueError, OverflowError):
            # OSError covers refused connections, timeouts and resolution
            # failures; the other two cover malformed host/port values.
            print(address + ', Connect Failed')
        else:
            list_out.append(address)
            print(address + ', Proxy Works Well')
    return list_out


def headers_choice():  # Pick a random User-Agent and build the request headers
    """Return browser-like request headers with a randomly chosen User-Agent.

    Returns:
        A new dict of HTTP headers for ``poi.mapbar.com``. Every entry is
        fixed except ``'User-Agent'``, which is drawn with ``random.choice``
        from a built-in pool of Chrome User-Agent strings.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/39.0.1132.57 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/48.0.1092.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/60.0.1090.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/59.0.1084.9 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/27.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/33.0.1063.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
    )
    # Static part of the header set; the User-Agent is appended last so the
    # key order stays the same on every call.
    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'poi.mapbar.com',
        'Upgrade-Insecure-Requests': '1',
    }
    request_headers['User-Agent'] = random.choice(user_agents)
    return request_headers


# 2.获取网页内容

def read_from_excel():
    """Open the workbook ``shenzhen_pharm_url.xlsx`` located in ``directory_path``.

    Returns:
        The workbook object returned by ``xlrd.open_workbook``, or ``None``
        (after printing a failure message) when the workbook cannot be opened.
    """
    # Build the path with os.path.join: a literal '\s' is an invalid escape
    # sequence, and a hard-coded backslash only works as a separator on Windows.
    file_path = os.path.join(directory_path, 'shenzhen_pharm_url.xlsx')
    try:
        return xlrd.open_workbook(file_path)
    except Exception:
        # Best-effort: report and hand back None, but do not swallow
        # KeyboardInterrupt/SystemExit the way a bare except would.
        print('表格读取失败')
        return None


def get_info(url, headers, proxies=None):
    """Download a page and return the text of its second ``POI_ulA`` list item.

    Args:
        url: Address of the page to download.
        headers: Request headers passed to ``requests.get``.
        proxies: Optional proxy mapping passed to ``requests.get``.

    Returns:
        All text nodes found under ``//ul[@class='POI_ulA']/li[2]``, joined
        into one string with carriage returns, newlines, tabs and spaces
        removed. An empty string when nothing matches.
    """
    response = requests.get(url, timeout=30, headers=headers, proxies=proxies)
    # Decode with the encoding detected from the body content.
    response.encoding = response.apparent_encoding
    tree = html.fromstring(response.text)

    cleaned = []
    for fragment in tree.xpath("//ul[@class='POI_ulA']/li[2]//text()"):
        for unwanted in ('\r', '\n', '\t', ' '):
            fragment = fragment.replace(unwanted, '')
        cleaned.append(fragment)
    return ''.join(cleaned)


def save_to_csv(text, *, path='baodingzongheyiyuandizhi.csv', encoding=None):  # Persist one record
    """Append *text* to a CSV file, creating the file if it does not exist.

    Args:
        text: The already formatted text to append (written as-is).
        path: Keyword-only. File to append to; defaults to the file name the
            script has always written to, relative to the working directory.
        encoding: Keyword-only. Text encoding for the file. ``None`` keeps
            the platform default, which is what the function used before.
            Pass e.g. ``'utf-8'`` for output that does not depend on the
            machine's locale.
    """
    # Plain append mode is enough: the file is only written, never read back.
    with open(path, 'a', encoding=encoding) as f:
        f.write(text)


# 主程序


def main():
    """Look up every row of the URL workbook and append the results to the CSV.

    Each row of sheet ``Sheet1`` supplies a link, a name and a type in its
    first three columns. The link is fetched with random request headers and,
    when one is available, a random working proxy. The text extracted by
    ``get_info`` is then written together with the name and type as one line
    ``<extracted text>,<name>,<type>``.

    A row whose request fails is reported and skipped, so the remaining rows
    are still processed.
    """
    file0 = read_from_excel()  # Load the workbook
    if file0 is None:
        # read_from_excel() has already reported the failure.
        return
    table0 = file0.sheet_by_name('Sheet1')
    nrows = table0.nrows
    proxies_list = proxies_get()
    proxies_choice = proxies_filter(proxies_list)
    if not proxies_choice:
        # random.choice() on an empty list would raise IndexError on the
        # first row; fall back to direct requests instead.
        print('No working proxy found, requests will be sent without a proxy')
    print('需要查询' + str(nrows) + '行数据\n' + '=' * 40)
    for i in range(nrows):
        headers = headers_choice()
        if proxies_choice:
            proxy_url = "http://" + str(random.choice(proxies_choice))
            # requests picks the proxy by the scheme of the target URL, so
            # both schemes are mapped. With only "https" present, plain
            # http:// links bypass the proxy.
            proxies = {
                "http": proxy_url,
                "https": proxy_url,
            }
        else:
            proxies = None
        link, name, type0 = table0.row_values(i)[:3]
        try:
            address = get_info(url=link, headers=headers, proxies=proxies)
        except requests.RequestException as error:
            # One unreachable page or dead proxy must not abort the whole run.
            print('Request failed for ' + str(link) + ': ' + str(error))
            continue
        output = '{},{},{}\n'.format(address, name, type0)
        print(output)
        save_to_csv(output)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()