# -*- coding: UTF-8 -*-
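"""Scrape company profiles from qichacha.com.

Pass 1 (fetch_urls) resolves company names to detail-page URLs through the
site search; pass 2 (fetch_details) pulls each company's basic-information
table into a CSV. A valid logged-in session cookie must be pasted into
COOKIES before running.
"""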
from urllib import parse

import csv
import time

import requests
from bs4 import BeautifulSoup

from util import fileutil  # local helper; provides fileutil.readfile_to_list
QICHACHA_DOMAIN_URL = 'http://www.qichacha.com/'
STATIC_QICHACHA_HEADER = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'referer': 'http://www.qichacha.com/',
}
COOKIES = 'PHPSESSID=5cjg03b6i7fhp1tt97s8j5i8a7; UM_distinctid=163200269a120d-0a85e4718193cd-444a022e-1fa400-163200269a3383; CNZZDATA1254842228=1838418069-1525246977-%7C1525246977; zg_did=%7B%22did%22%3A%20%22163200269db251-00405643f13362-444a022e-1fa400-163200269dc379%22%7D; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1525250419; hasShow=1; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1525250494; acw_tc=AQAAADaAKGFLHQ8A9k33dIbZywjU8Hvy; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201525250419175%2C%22updated%22%3A%201525250646947%2C%22info%22%3A%201525250419178%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%7D'
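# NOTE: this cookie is bound to one logged-in browser session and expires;
# paste a fresh value from your own qichacha.com session before running.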
def set_no_keep_alive():
    # Ask the server to close the connection after every response; long
    # sequential scraping runs otherwise accumulate kept-alive connections.
    STATIC_QICHACHA_HEADER['connection'] = 'close'
def get_base_resp(part_url, cookies):
    # Copy the shared header template so this request's cookie does not
    # leak into later calls.
    headers = dict(STATIC_QICHACHA_HEADER)
    headers['cookie'] = cookies
    return requests.get(QICHACHA_DOMAIN_URL + part_url, headers=headers, timeout=30)
def get_search_resp(name):
    # qichacha's search endpoint takes the query as ?key=<name>.
    part_url = 'search?' + parse.urlencode({'key': name})
    return get_base_resp(part_url, COOKIES)
def parse_search_resp(res):
    # The second cell of the first result row links to the company's
    # detail page; return 'empty' when the search came back with no hits.
    parser = BeautifulSoup(res.text, 'lxml')
    tags = parser.select('table.m_srchList')
    try:
        tds = tags[0].select('td')
        url = tds[1].a.attrs['href']
    except (IndexError, AttributeError):
        url = 'empty'
    return url
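# Detail-page links are relative paths of the form '/firm_<id>.html', e.g.
# '/firm_127fcf53de2598be86ab1b3cad165cf1.html' from an earlier test run.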
def parse_detail_resp(res, name, datas):
    parser = BeautifulSoup(res.text, 'lxml')
    tags = parser.select('table.ntable')
    # The second 'ntable' is the basic-information table; its cells
    # alternate label/value, so the values sit at odd indices 1, 3, ..., 39.
    tds = tags[1].select('td')
    reg_cap = tds[1].text.strip()          # registered capital
    act_cap = tds[3].text.strip()          # paid-in capital
    bus_state = tds[5].text.strip()        # business status
    estab_date = tds[7].text.strip()       # date of establishment
    reg_num = tds[9].text.strip()          # registration number
    org_code = tds[11].text.strip()        # organization code
    uscc = tds[13].text.strip()            # unified social credit code
    tax_id = tds[15].text.strip()          # taxpayer ID
    com_type = tds[17].text.strip()        # company type
    industry = tds[19].text.strip()        # industry
    valid_date = tds[21].text.strip()      # approval date
    reg_auth = tds[23].text.strip()        # registration authority
    region = tds[25].text.strip()          # region
    eng_name = tds[27].text.strip()        # English name
    old_name = tds[29].text.strip()        # former name
    bus_method = tds[31].text.strip()      # mode of operation
    per_scale = tds[33].text.strip()       # staff size
    bus_date_range = tds[35].text.strip()  # term of operation
    address = tds[37].text.strip()         # company address
    bus_scope = tds[39].text.strip()       # business scope
    value = [name,
             reg_cap, act_cap, bus_state, estab_date, reg_num, org_code, uscc, tax_id, com_type,
             industry, valid_date, reg_auth, region, eng_name, old_name, bus_method, per_scale,
             bus_date_range, address, bus_scope]
    datas.append(value)
    print(value)
def fetch_urls(srcPath, dstPath):
    # Pass 1: resolve every company name in srcPath to a detail-page URL
    # and write (name, url) rows to dstPath.
    name_list = fileutil.readfile_to_list(srcPath)
    datas = []
    for name in name_list:
        res = get_search_resp(name)
        url = parse_search_resp(res)
        datas.append([name, url])
        print(name, url)
        time.sleep(5)  # throttle to stay under the site's rate limiting
    # utf_8_sig writes a BOM so Excel opens the CSV with readable Chinese.
    with open(dstPath, 'w', newline='', encoding='utf_8_sig') as f:
        csv.writer(f).writerows(datas)
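# Each row written by fetch_urls is 'name,/firm_<id>.html', or 'name,empty'
# when the search found nothing; fetch_details consumes that file.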
def fetch_details(srcPath, dstPath):
    # Pass 2: scrape the basic-information table for every resolved URL.
    data_list = fileutil.readfile_to_list(srcPath)
    name_rows = []
    for data in data_list:
        # Split on the last comma so names containing commas stay intact.
        arr = data.rsplit(',', 1)
        name_rows.append([arr[0], arr[1]])
    datas = []
    # Header row: company name, registered capital, paid-in capital,
    # business status, establishment date, registration number, org code,
    # unified social credit code, taxpayer ID, company type, industry,
    # approval date, registration authority, region, English name, former
    # name, mode of operation, staff size, term of operation, address,
    # business scope.
    datas.append([u'公司名称', u'注册资本', u'实缴资本', u'经营状态', u'成立日期', u'注册号', u'组织机构代码',
                  u'统一社会信用代码', u'纳税人识别号', u'公司类型', u'所属行业', u'核准日期', u'登记机关', u'所属地区',
                  u'英文名', u'曾用名', u'经营方式', u'人员规模', u'营业期限', u'企业地址', u'经营范围'])
    for data in name_rows:
        name = data[0]
        part = data[1]
        if part == 'empty':
            # The URL pass found no detail page for this name.
            datas.append([name, part])
            print(name, part)
        else:
            # part is '/firm_<id>.html'; drop the leading slash because
            # QICHACHA_DOMAIN_URL already ends with one.
            res = get_base_resp(part[1:], COOKIES)
            parse_detail_resp(res, name, datas)
            time.sleep(5)  # throttle to stay under the site's rate limiting
    with open(dstPath, 'w', newline='', encoding='utf_8_sig') as f:
        csv.writer(f).writerows(datas)
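# The fixed td indices in parse_detail_resp mirror the basic-information
# table layout qichacha served at the time this was written; if the site
# markup changes, those indices (and the header row above) will need updating.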
if __name__ == '__main__':
    set_no_keep_alive()
    fetch_urls('D:/ISO/mdsdataclean/input/company.csv', 'input/company_urls.csv')
    fetch_details('input/company_urls.csv', 'output/qichacha.csv')