Getting Started with Python Web Scraping: A Detailed Walkthrough (Part 2)


First, the tail of the previous snippet (the start of the regex definition was clipped in the original post):

```python
result = re.findall(pattern, html)
# Method 1:
# s = re.sub('\n', ',', result[0])
# print(s)
# Method 2 (the original had '/n', a typo for the newline escape '\n'):
print(result[0].replace('\n', ','))
```

0x02 Scraping Movie Information

```python
import requests
import re
import time

# count = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
h = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}
response = requests.get('', headers=h)  # the board URL is elided in the original post
response.encoding = 'utf-8'
html = response.text

# Parse out the title, leading actors ("主演"), and release date ("上映时间").
# The closing </p> tags in the pattern were stripped by the original post's
# HTML sanitizing and are restored here.
time.sleep(2)
pattern = re.compile('class="name">.*?title="(.*?)".*?主演:(.*?)</p>.*?上映时间:(.*?)</p>', re.S)
result = re.findall(pattern, html)
print(result)

with open('maoyan.txt', 'a', encoding='utf-8') as f:
    for item in result:
        # result stores each movie as a tuple; write its fields comma-separated
        for i in item:
            f.write(i.strip().replace('\n', ','))
```
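The commented-out `count` list suggests the board was meant to be walked in offset steps of 10. Here is a minimal sketch of that pagination loop, under the assumption that the page takes an `offset` query parameter (the real board URL is elided in the post, so a placeholder is used):

```python
import time
import requests

BASE_URL = 'https://example.com/board'  # placeholder: the real URL is elided in the post
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}

for offset in range(0, 100, 10):  # the ten offsets from the commented-out count list
    resp = requests.get(BASE_URL, params={'offset': offset}, headers=HEADERS)
    resp.encoding = 'utf-8'
    html = resp.text
    # ...run the same re.findall + file-writing step on each page...
    time.sleep(1)  # be polite between requests
```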
0x03 Scraping Images

```python
import requests
import re
import time

# Fetch the page and extract the image URLs
def get_urls():
    response = requests.get('')  # the page URL is elided in the original post
    html_str = response.text
    # Parse the HTML to get the image URLs. The exact pattern was lost to
    # image-placeholder residue in the original; a generic <img src> pattern
    # is assumed here.
    pattern = re.compile('<img.*?src="(.*?)"', re.S)
    urls = re.findall(pattern, html_str)
    return urls

# Download each image into temp/ under its original file name
def down_load_img(urls):
    for url in urls:
        response = requests.get(url)
        with open('temp/' + url.split('/')[-1], 'wb') as f:
            f.write(response.content)
        print(url.split('/')[-1], 'downloaded successfully')

if __name__ == '__main__':
    urls = get_urls()
    down_load_img(urls)
```
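For larger files it can be safer to stream the download instead of buffering the whole body in memory via `response.content`. Here is a variant of the download step using requests' `stream=True` and `iter_content` (standard requests API; `temp/` is the same directory the snippet above writes to):

```python
import requests

def download_streamed(url, dest_dir='temp'):
    filename = url.split('/')[-1]
    # stream=True defers fetching the body; iter_content then reads it in chunks
    with requests.get(url, stream=True, timeout=10) as response:
        response.raise_for_status()  # fail loudly on HTTP errors
        with open(dest_dir + '/' + filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    print(filename, 'downloaded (streamed)')
```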
0x04 Thread Pools

A thread pool is a form of multithreaded processing: tasks are added to a queue and are picked up automatically by worker threads that were created up front. Thread-pool threads are background (daemon) threads; each runs with the default stack size and at the default priority in a multithreaded environment.
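Before the `ThreadPoolExecutor` demo below, a hand-rolled sketch with the standard library's `queue.Queue` makes the "tasks queued, workers created up front" mechanism concrete (an illustration of the idea, not how `ThreadPoolExecutor` is implemented internally):

```python
import queue
import threading

tasks = queue.Queue()

def worker():
    while True:
        item = tasks.get()
        if item is None:        # sentinel: no more work
            break
        print(threading.current_thread().name, 'processing task', item)
        tasks.task_done()

# create the pool of background threads up front
threads = [threading.Thread(target=worker, daemon=True) for _ in range(3)]
for t in threads:
    t.start()

for i in range(10):             # enqueue tasks; idle workers pick them up
    tasks.put(i)
tasks.join()                    # block until every queued task is processed

for _ in threads:               # shut the workers down cleanly
    tasks.put(None)
for t in threads:
    t.join()
```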
"""线程池"""from concurrent.futures import ThreadPoolExecutorimport timeimport threadingdef ban_zhuang(i):print(threading.current_thread().name,"**开始搬砖{}**".format(i))time.sleep(2)print("**员工{}搬砖完成**一共搬砖:{}".format(i,12**2))#将format里的内容输出到{}if __name__ == '__main__':#主线程start_time = time.time()print(threading.current_thread().name,"开始搬砖")with ThreadPoolExecutor(max_workers=5) as pool:for i in range(10):p = pool.submit(ban_zhuang,i)end_time =time.time()print("一共搬砖{}秒".format(end_time-start_time))结合多线程的爬虫:
```python
"""Image scraper (keyword "美女") with one thread per downloaded image"""
import requests
import re
from urllib.parse import urlencode
import time
import threading

# ;app_name=web_search&offset=0&format=json&keyword=%E7%BE%8E%E5%A5%B3&autoload=true&count=20
def get_urls(page):
    keys = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': 20 * page,   # each page of results holds 20 items
        'keyword': '美女',
        'count': '20',
    }
    keys_word = urlencode(keys)
    url = '' + keys_word  # the API base URL is elided in the original post
    response = requests.get(url)
    print(response.status_code)
    html_str = response.text
    # the response is JSON; pull the image URLs out of the "large_image_url" fields
    pattern = re.compile('"large_image_url":"(.*?)"', re.S)
    urls = re.findall(pattern, html_str)
    return urls

# Download every image in urls into pic/
def download_imags(urls):
    for url in urls:
        try:
            response = requests.get(url)
            with open('pic/' + url.split('/')[-1] + '.jpg', 'wb') as f:
                f.write(response.content)
            print(url.split('/')[-1] + '.jpg', 'downloaded')
        except Exception as err:
            print('An exception happened:', err)

if __name__ == '__main__':
    start = time.time()
    threads = []
    for page in range(3):
        urls = get_urls(page)
        # One thread per image URL. download_imags expects an iterable of URLs,
        # so wrap each URL in a list (the original passed the bare string,
        # which made the function iterate over its characters).
        for url in urls:
            th = threading.Thread(target=download_imags, args=([url],))
            threads.append(th)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    end = time.time()
    print('elapsed:', end - start)
```
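Spawning one thread per image does not scale well; the thread pool from 0x04 fits naturally here. Below is a sketch that bounds concurrency by reusing `get_urls` and `download_imags` from the snippet above (`download_all` is a helper name introduced here, not from the original post):

```python
from concurrent.futures import ThreadPoolExecutor

def download_all(urls, max_workers=5):
    # a bounded pool instead of one thread per URL; the with-block
    # waits for every submitted download before returning
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for url in urls:
            pool.submit(download_imags, [url])

if __name__ == '__main__':
    for page in range(3):
        download_all(get_urls(page))
```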