企查查 (Qichacha) company-information scraper

# -*- coding: UTF-8 -*-

from urllib import request
from urllib import parse
from urllib.request import urlopen
# from lxml import etree

from bs4 import BeautifulSoup

import csv
import time

import requests

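# Local helper package; fileutil.readfile_to_list() used below is expected to come from here.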
from util import *



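# Base URL and browser-like request headers sent with every qichacha.com request.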
QICHACHA_DOMAIN_URL = 'http://www.qichacha.com/'

STATIC_QICHACHA_HEADER = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
               'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
               'accept-encoding': 'gzip, deflate',
               'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
               'referer': 'http://www.qichacha.com/',
                          }

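# Cookie string captured from a logged-in browser session; qichacha.com expects a valid
# session, so this value has to be refreshed once it expires.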
COOKIES = 'PHPSESSID=5cjg03b6i7fhp1tt97s8j5i8a7; UM_distinctid=163200269a120d-0a85e4718193cd-444a022e-1fa400-163200269a3383; CNZZDATA1254842228=1838418069-1525246977-%7C1525246977; zg_did=%7B%22did%22%3A%20%22163200269db251-00405643f13362-444a022e-1fa400-163200269dc379%22%7D; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1525250419; hasShow=1; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1525250494; acw_tc=AQAAADaAKGFLHQ8A9k33dIbZywjU8Hvy; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201525250419175%2C%22updated%22%3A%201525250646947%2C%22info%22%3A%201525250419178%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%7D'


def set_no_keep_alive():
    # requests reuses HTTP connections (keep-alive) by default; asking the
    # server to close each connection keeps a long, slow crawl from holding
    # sockets open between requests.
    STATIC_QICHACHA_HEADER['Connection'] = 'close'

def get_base_resp(part_url, cookies):
    # Work on a copy so the shared header constant is not mutated on every
    # call, then attach the browser session cookie.
    headers = dict(STATIC_QICHACHA_HEADER)
    headers['cookie'] = cookies

    # A timeout keeps a stalled request from hanging the whole crawl.
    res = requests.get(QICHACHA_DOMAIN_URL + part_url, headers=headers, timeout=30)
    return res

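# Query the site's /search endpoint for a company name and return the raw response.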
def get_search_resp(name):
    part = {'key': name}
    part_url = 'search?' + parse.urlencode(part)
    res = get_base_resp(part_url, COOKIES)
    return res

def parse_search_resp(res):
    # The first row of the m_srchList results table links to the company's
    # detail page; fall back to 'empty' when the search returned nothing usable.
    parser = BeautifulSoup(res.text, 'lxml')
    tags = parser.select('table[class="m_srchList"]')

    try:
        tds = tags[0].select('td')
        url = tds[1].a.attrs['href']
    except (IndexError, AttributeError, KeyError):
        url = 'empty'
    return url

def parse_detail_resp(res, name, datas):
    # lxml is the parser used here (html.parser, xml and html5lib were also
    # tried). The second "ntable" on the detail page holds the registration
    # details as alternating label/value cells, so the values sit at the odd
    # <td> indexes read below.
    parser = BeautifulSoup(res.text, 'lxml')
    tags = parser.select('table[class="ntable"]')
    table1 = tags[1]

    tds = table1.select('td')

    reg_cap = tds[1].text.strip()
    act_cap = tds[3].text.strip()
    bus_state = tds[5].text.strip()
    estab_date = tds[7].text.strip()
    reg_num = tds[9].text.strip()
    org_code = tds[11].text.strip()
    uscc = tds[13].text.strip()
    tax_id = tds[15].text.strip()
    com_type = tds[17].text.strip()
    industry = tds[19].text.strip()
    valid_date = tds[21].text.strip()
    reg_auth = tds[23].text.strip()
    region = tds[25].text.strip()
    eng_name = tds[27].text.strip()
    old_name = tds[29].text.strip()
    bus_method = tds[31].text.strip()
    per_scale = tds[33].text.strip()
    bus_date_range = tds[35].text.strip()
    address = tds[37].text.strip()
    bus_scope = tds[39].text.strip()

    value = [name,
        reg_cap, act_cap, bus_state, estab_date, reg_num, org_code, uscc, tax_id, com_type,
        industry, valid_date, reg_auth, region, eng_name, old_name, bus_method, per_scale, bus_date_range, address, bus_scope]
    datas.append(value)
    print(value)
    # print(parser.prettify())



# def parse_resp_old(res):
    # doc = etree.parse(res.text)
    # table1 = doc.xpath("//table[@class='ntable'][1]")
    # print(table1)

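# Pass 1: for each company name in srcPath, look up its detail-page URL via the
# site search and write "name,url" rows to dstPath.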
def fetch_urls(srcPath, dstPath):
    name_list = fileutil.readfile_to_list(srcPath)
    datas = []
    for name in name_list:
        res = get_search_resp(name)
        url = parse_search_resp(res)
        datas.append([name, url])
        print(name, url)
        time.sleep(5)

    with open(dstPath, 'w', newline='', encoding='utf_8_sig') as f:
        writer = csv.writer(f)
        for row in datas:
            writer.writerow(row)

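# Pass 2: read the "name,url" rows produced by fetch_urls, download each detail
# page, parse the registration fields and write everything to dstPath as CSV.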
def fetch_details(srcPath, dstPath):
    data_list = fileutil.readfile_to_list(srcPath)
    name_rows = []
    for data in data_list:
        arr = data.split(',')
        name_rows.append([arr[0], arr[1]])

    datas = []
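    # Output header row: company name, registered capital, paid-in capital, business status,
    # establishment date, registration number, organisation code, unified social credit code,
    # taxpayer ID, company type, industry, approval date, registration authority, region,
    # English name, former name, business mode, staff size, business term, address, business scope.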
    datas.append([u'公司名称', u'注册资本', u'实缴资本', u'经营状态', u'成立日期', u'注册号', u'组织机构代码',
                  u'统一社会信用代码', u'纳税人识别号', u'公司类型', u'所属行业', u'核准日期', u'登记机关', u'所属地区',
                  u'英文名', u'曾用名', u'经营方式', u'人员规模', u'营业期限', u'企业地址', u'经营范围', ])
    for data in name_rows:
        name = data[0]
        part = data[1]
        if part == 'empty':
            datas.append([name,part])
            print(name, part)
        else:
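            # part is a site-relative path such as '/firm_<id>.html'; drop the
            # leading slash before appending it to the domain URL.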
            res = get_base_resp(part[1:], COOKIES)
            parse_detail_resp(res, name, datas)

        time.sleep(5)

    with open(dstPath, 'w', newline='', encoding='utf_8_sig') as f:
        writer = csv.writer(f)
        for row in datas:
            writer.writerow(row)



if __name__ == '__main__':
    set_no_keep_alive()

    # res = get_search_resp(u'格力电器')
    # print(res.text)

    # name = u'格力电器'
    # res = get_base_resp('/firm_127fcf53de2598be86ab1b3cad165cf1.html'[1:], COOKIES)
    # datas = []
    # parse_detail_resp(res, name, datas)
    # print(res.text)

    # name = u'格力电器'
    # res = get_search_resp(name)
    # url = parse_search_resp(res)
    # print(name, url)

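    # Pass 1: resolve a detail-page URL for every company name in the input list.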
    fetch_urls('D:/ISO/mdsdataclean/input/company.csv', 'input/company_urls.csv')

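    # Pass 2: crawl and parse each company's detail page into the output CSV.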
    fetch_details('input/company_urls.csv', 'output/qichacha.csv')