中国采购招标网爬虫采集破解!你学会了吗?( 二 )

5. 详细代码
#!/usr/bin/python3
# encoding: utf-8
"""Cookie-decryption helper for the China procurement/bidding site.

@version: v1.0
@author: W_H_J
@license: Apache Licence
@contact: 415900617@qq.com
@software: PyCharm
@file: cookieDecodeJs.py
@time: 2020/10/19 21:32
@describe: Cracks the JavaScript that generates the site's anti-crawler
           cookies (the site answers HTTP 521 until both cookies are sent).

Flow, as implemented below:
  1. ``first_cookie_decode``  - evaluates the JS from the first 521 response.
  2. ``second_cookie_decode`` - rewrites the JS from the second 521 response
     and saves it to a local ``cookie_js.js`` file.
  3. ``third_cookie_decode``  - loads a page with Selenium/Chrome and reads the
     computed cookie from the ``#cookieId`` element.
  4. ``get_decode_cookie`` / ``get_html`` - combine the cookies and fetch.
"""
import sys
import os
import time

from requests.adapters import HTTPAdapter
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from pyquery import PyQuery as pq
import execjs
import requests_html

sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))
sys.path.append("..")

# One shared session; both schemes retry failed connections up to 6 times.
SESSION = requests_html.HTMLSession()
SESSION.mount('http://', HTTPAdapter(max_retries=6))
SESSION.mount('https://', HTTPAdapter(max_retries=6))

# Base request headers. NOTE: this dict is mutated in place (a 'cookie' key is
# added) by first_cookie_decode() and get_decode_cookie().
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Host': 'shanxi.chinabidding.cc'
}


def first_cookie_decode(base_url):
    """Crack the first-stage cookie.

    :param base_url: the original request URL
    :return: tuple ``(HEADERS, cookies_text)`` - the shared header dict (with
        ``cookie`` set when the response was a 521) and the ``name=value``
        cookies sent by the server joined with ``;``. ``cookies_text`` is an
        empty string when the response status is not 521.
    """
    # Initialised up front: the original only assigned this inside the 521
    # branch, so any other status raised UnboundLocalError at the return.
    cookies_text = ""
    response = SESSION.get(base_url, headers=HEADERS, timeout=(8, 8))
    if response.status_code == 521:
        cookies = response.cookies
        # NOTE(review): both arguments are empty in the recovered source, which
        # makes this call a no-op. The original presumably stripped the
        # <script> wrapper around the JS expression here - confirm against the
        # upstream file before relying on the execjs.eval() below.
        str_js_cookie = response.text.replace("", "")
        print("0. 待破解字段==>", str_js_cookie)
        # Evaluate the JS and keep only the first "name=value" segment.
        # NOTE(review): this evaluates JavaScript received from the remote
        # server; only run it against hosts you trust.
        js_result = execjs.eval(str_js_cookie).split(";")[0]
        print("1. 待破解加密字段一==>", js_result)
        cookies_text = ';'.join('='.join(item) for item in cookies.items())
        print("2. 加密字段一==>", cookies_text)
        # This header value can be reused for subsequent requests.
        HEADERS['cookie'] = cookies_text + "; " + js_result
    else:
        print("状态不为521 , 可直接使用-first_cookie_decode")
    return HEADERS, cookies_text


def second_cookie_decode(base_url):
    """Crack the second-stage cookie script.

    Requests ``base_url`` again with the first-stage cookie. On a 521 response
    the returned JS is rewritten so that its ``go({...})`` call is wrapped in a
    ``back()`` function whose result is written into the ``#cookieId`` element,
    and the rewritten script is saved to ``cookie_js.js``.

    :param base_url: the original request URL
    :return: tuple ``(cookies_first_decode, status_code)`` - the first-stage
        cookie text and the HTTP status of the second request
    """
    # Browser headers and the first decoded cookie field.
    headers_first, cookies_first_decode = first_cookie_decode(base_url)
    response = SESSION.get(base_url, headers=headers_first, timeout=(8, 8))
    if response.status_code == 521:
        str_base = response.text
        # Elements of the "bts" array; + 12 skips the marker plus the opening
        # quote of the first element.
        bts_start = str_base.find('go({"bts":[') + 12
        bts_end = str_base.find('"],"chars')
        js_cookie = str_base[bts_start:bts_end].split('","')
        # join() instead of js_cookie[0] + js_cookie[1]: same text when both
        # parts exist, but no IndexError in a debug print when they do not.
        print("3. 待破解加密字段二==>", "".join(js_cookie[:2]))
        print("4. 待破解字段三==>", str_base)

        # Expression assigned to ['ie'], taken from the whitespace-free text
        # (computed once instead of three times).
        compact = str_base.replace(" ", "")
        str_ie_cookie = compact[compact.find("'ie']=") + 6:compact.find("location[")]
        print("5. 破解的加密JS内容==>", str_ie_cookie)

        str_cookie_temp = '''var cookies={}\nreturn cookies;'''.format(str_ie_cookie)
        # The section swapped out for the "return cookies" snippet above.
        str_base_temp = str_base[str_base.rfind(")]);if(") + 4:str_base.rfind("}};go({") + 1]
        str_back_fun = '''var str_back = back();console.log(str_back); setTimeout(function() {document.getElementById('cookieId').innerHTML=str_back;}, 500);'''
        # NOTE(review): the last replace() had an EMPTY search string in the
        # recovered source, which would insert "function back(){" between
        # every character. "<script>" is presumed to be the stripped original
        # (the rewritten code needs an opening "function back(){" to match the
        # closing "}" added after go(...)) - confirm against the upstream file.
        str_js_cookie = (
            str_base.replace(str_base_temp, str_cookie_temp)
            .replace("go({", "var back_cookie = go({")
            .replace("})", "}); return back_cookie;}" + str_back_fun)
            .replace("<script>", "function back(){")
        )
        # Save the rewritten JS locally; per the original author's note it is
        # then served by a local service and executed through Selenium to get
        # the real cookie.
        with open("./statisticalDataSpider/spider/business/cookieDecode/static/cookie_js.js", 'w', encoding="utf-8") as f:
            f.write(str_js_cookie)
    else:
        print("状态不为521 , 可直接使用")
    return cookies_first_decode, response.status_code


def third_cookie_decode(status_code):
    """Obtain the second cookie by running the saved JS in Chrome.

    :param status_code: HTTP status returned by ``second_cookie_decode``;
        nothing is done unless it is 521
    :return: the first ``;``-separated segment of the ``#cookieId`` text, e.g.
        ``__jsl_clearance=1603095437|0|IIfaCCOxqoEKlTN7dqVU%2Blb2ypw%3D``, or
        ``None`` when the status is not 521 or the element still holds
        ``"undefined"`` / the placeholder text
    """
    # Per the original author: Chrome and chromedriver must both be version 73.0.
    if status_code != 521:
        return None

    options = ChromeOptions()
    # Do not load images, to speed up the page load.
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    # Optionally point at a specific browser binary:
    # options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
    options.add_argument('user-agent="{}"'.format(HEADERS['User-Agent']))
    # Important: drops the automation switch so sites are less likely to
    # detect that Selenium is driving the browser.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_path = r"./statisticalDataSpider/spider/business/cookieDecode/static/chromedriver.exe"
    browser = webdriver.Chrome(executable_path=chrome_path, options=options)
    try:
        # NOTE(review): the URL is empty in the recovered source. Per the
        # original comment it should be the author's local service that serves
        # cookie_js.js - fill in before running.
        browser.get('')
        # Wait for the page script (which uses a 500 ms setTimeout) to fill
        # #cookieId.
        time.sleep(5)
        doc = pq(browser.page_source)
        js_cookie = doc("#cookieId").text()
    finally:
        # quit() rather than close(), inside finally: ends the driver session
        # even when get()/parsing raises, so no browser process is leaked.
        browser.quit()

    if js_cookie != "undefined" and js_cookie != "默认":
        return js_cookie.split(";")[0]
    return None


def get_decode_cookie(base_url):
    """Build the final request headers containing both decoded cookies.

    :param base_url: the original URL
    :return: the shared ``HEADERS`` dict with ``cookie`` set, or ``None`` when
        no second cookie was obtained
    """
    cookie_first_decode, status_code = second_cookie_decode(base_url)
    cookie_second_decode = third_cookie_decode(status_code)
    if cookie_second_decode is None:
        return None
    print("6. 破解加密字段一成功==>", cookie_first_decode)
    print("7. 破解加密字段二成功==>", cookie_second_decode)
    HEADERS['cookie'] = cookie_first_decode + "; " + cookie_second_decode
    print("8. 解密后的Headers==>", HEADERS)
    return HEADERS


def get_html(base_url):
    """Fetch ``base_url`` using the decoded cookies.

    :param base_url: the URL to fetch
    :return: tuple ``(status_code, text)`` of the final response
    """
    # Fall back to the base headers when no cookie was decoded (e.g. the site
    # did not answer 521); the original passed headers=None in that case,
    # dropping the User-Agent and Host headers.
    headers = get_decode_cookie(base_url) or HEADERS
    # NOTE(review): verify=False disables TLS certificate verification.
    html = SESSION.get(base_url, headers=headers, verify=False, timeout=(5, 5))
    print("9. 最终返回状态码==>", html.status_code)
    print("BODY", html.text)
    return html.status_code, html.text


if __name__ == '__main__':
    # NOTE(review): the scheme/host part of this URL and the rest of this
    # block were lost when the article was extracted; the host is presumably
    # the one in HEADERS['Host'] - confirm and complete before running.
    url = 'lists.html?page=4'
    # NOTE(review): presumed entry call; the original statement is not
    # visible in the recovered source.
    get_html(url)

# PS (from the original article): readers who need Python learning materials
# can get them via the link below the article.