人妖在线一区,国产日韩欧美一区二区综合在线,国产啪精品视频网站免费,欧美内射深插日本少妇

新聞動態

Python爬蟲,獲取,解析,存儲詳解

發布日期:2021-12-28 08:37 | 文章來源:CSDN

1.獲取數據

import requests


def drg(url, timeout=30):
    """Fetch *url* and return the decoded response body as text.

    On any request failure, returns the string "產(chǎn)生異常" ("an exception
    occurred"), preserving the original error contract.

    :param url: page URL to download
    :param timeout: request timeout in seconds (new, defaulted — backward
        compatible; the original had no timeout and could hang forever)
    """
    try:
        # Present a desktop Chrome UA so the site serves the normal page.
        head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/'
                              '537.36 (KHTML, like Gecko) Chrome/'
                              '91.0.4472.164 Safari/537.36'}
        r = requests.get(url, headers=head, timeout=timeout)
        r.raise_for_status()  # raise HTTPError if the status is not 2xx
        # Decode using the encoding detected from the body to avoid mojibake.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # BUG FIX: the original used a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; narrowed to request-level errors.
        return "產(chǎn)生異常"


url = "https://www.ip138.com/mobile.asp?mobile=13018305773&action=mobile"
print(drg(url))

2.解析數據

import requests
def login():
 """Log in to www.cqooc.com and return an authenticated requests.Session.

 Relies on the module-level globals `Form` (POST payload) and `headers`
 (User-Agent), defined in the __main__ block. On failure the exception is
 printed and the function implicitly returns None — callers must handle
 a None session.
 """
 try:
  # Login endpoint. NOTE(review): the credentials appear BOTH in the query
  # string and in the POST body `Form`; presumably only one is required —
  # verify against the site. Credentials in a URL also leak into logs.
  urllogin="http://www.cqooc.com/user/login?username=12608199000635&password=48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69&nonce=6BA36BBB1F623279&cnonce=8257070573EFE28F"
  s=requests.session()
  r=s.post(urllogin,data=Form,headers=headers)
  r.encoding = r.apparent_encoding  # decode using detected encoding
  r.raise_for_status()  # raise for non-2xx status
  return s
 except Exception as error:
  # Best-effort: print the error and fall through (returns None).
  print(error)
def get_html(s,url):
 """Fetch *url* through the authenticated session *s* and return its text.

 Uses the module-level `headers`. Prints the error and returns None when
 the request fails or the status is not 2xx.
 """
 try:
  response = s.get(url, headers=headers)
  response.encoding = response.apparent_encoding
  response.raise_for_status()
 except Exception as err:
  print(err)
  return None
 return response.text
if __name__=="__main__":
 # Browser User-Agent sent with every request after login.
 headers = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36",
 }
 # Login form payload — replace with your own account's values.
 Form = {
  "username": "12608199000635",
  "password": "48C032612C2A6777D28A969307B52127E198D59AA78522943C1B283CF7B89E69",
  "nonce": "6BA36BBB1F623279",
  "cnonce": "8257070573EFE28F"
 }
 lin=login()
 # URL of the personal "my learning" page, fetched with the logged-in session.
 url="http://www.cqooc.com/my/learn"
 html=get_html(lin,url)
 print(html)

3.數據保存為CSV格式和存入數據庫

保存為CSV

import  requests
from lxml import etree
import csv
# fetch page HTML
def get_html(url,time=30):
 """Download *url* and return its decoded text, or None on failure.

 :param url: page URL to fetch
 :param time: request timeout in seconds
 """
 try:
  response = requests.get(url, timeout=time)
  response.encoding = response.apparent_encoding  # avoid mojibake
  response.raise_for_status()
 except Exception as err:
  print(err)
  return None
 return response.text
def parser(html):
 """Parse a Qidian ranking page into a list of rows.

 Each row is ``[title, author, synopsis, update_date]``.

 :param html: raw HTML of the ranking page
 :return: list of 4-element row lists
 """
 doc = etree.HTML(html)  # parse HTML into an lxml element tree
 out_list = []
 # BUG FIX: the original XPath had been mangled to "http://*[...]" by a
 # CMS auto-linker; an absolute XPath must start with "//*" or it matches
 # nothing at all.
 for row in doc.xpath("//*[@class='book-img-text']//li/*[@class='book-mid-info']"):
  row_data = [
   row.xpath("h4/a/text()")[0],                        # book title
   row.xpath("p[@class='author']/a/text()")[0],        # author
   row.xpath("p[2]/text()")[0].strip(),                # synopsis
   row.xpath("p[@class='update']/span/text()")[0],     # last update date
  ]
  out_list.append(row_data)
 return out_list
def save_csv(item,path):
 """Append the rows in *item* to the CSV file at *path*.

 Opens the file in append mode with UTF-8 encoding and ``newline=''``
 (the csv-module requirement that prevents blank interleaved lines).
 """
 with open(path, "a+", newline='', encoding="utf-8") as out_file:
  csv.writer(out_file).writerows(item)
if __name__=="__main__":
 # Crawl ranking pages 1..5 and append each page's rows to the CSV.
 for i in range(1,6):
  url="https://www.qidian.com/rank/fengyun?style=1&page={0}".format(i)
  html=get_html(url) # fetch the page HTML
  out_list=parser(html) # parse it into list-of-rows
  save_csv(out_list,"d:\\book.csv") # append rows to the CSV file

存入數(shù)據(jù)庫

import pymysql
import requests
from lxml import etree
def get_html(url, time=3000):
 """Download *url* with a desktop-Edge User-Agent and return its text.

 Prints the error and returns None on failure.

 NOTE(review): the default timeout is 3000 *seconds* — this looks like a
 milliseconds value that was pasted as-is; confirm intent before changing.
 """
 ua = {
  "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
 }
 try:
  response = requests.get(url, timeout=time, headers=ua)
  response.encoding = response.apparent_encoding
  response.raise_for_status()
 except Exception as err:
  print(err)
  return None
 return response.text
result = []  # module-level accumulator; parse_html appends one list per book


def parse_html(html):
 """Extract book rows from a Douban 'latest books' page into `result`.

 Each appended row is ``[Naame, score, price, content, a, b]`` where the
 last four values come from splitting the publication-info line on "/".
 (Field names kept from the original to match the DB columns.)

 :param html: raw HTML of one listing page
 """
 tree = etree.HTML(html)
 for row in tree.xpath('//*[@id="content"]/div/div[1]/ul/li'):
  Naame = row.xpath("div[2]/h2/a/text()")[0].strip()           # book title
  score = row.xpath("div[2]/p[2]/span[2]/text()")[0].strip()   # rating
  # Publication info line, e.g. "author / publisher / date / price".
  parts = row.xpath("div[2]/p[1]/text()")[0].strip().split("/")
  # BUG FIX: the original rebound `price` to parts[0] (a string) and then
  # read price[1], price[2], price[-1] — i.e. single CHARACTERS of that
  # string, not the remaining fields. Index the split list instead.
  price = parts[0]
  content = parts[1]
  a = parts[2]
  b = parts[-1]
  result.append([Naame, score, price, content, a, b])
def join_all(sql_insert,vals,**dbinfo):
 """Bulk-insert *vals* with *sql_insert* into the MySQL DB given by *dbinfo*.

 Commits on success; rolls back on error; always closes the cursor and the
 connection.

 :param sql_insert: parameterized INSERT statement
 :param vals: sequence of value tuples for executemany
 :param dbinfo: keyword arguments forwarded to pymysql.connect
 """
 connet = None
 cursor = None
 try:
  connet = pymysql.connect(**dbinfo)
  cursor = connet.cursor()
  cursor.executemany(sql_insert,vals)
  connet.commit()
  print('添加成功!')
 except Exception as err:
  print(err)
  # BUG FIX: the original called connet.rollback() unconditionally, which
  # raised NameError when connect() itself failed; same for cursor.close()
  # which sat outside the try. Guard both and always release resources.
  if connet is not None:
   connet.rollback()
 finally:
  if cursor is not None:
   cursor.close()
  if connet is not None:
   connet.close()
if __name__=="__main__":
 # DB connection settings — hoisted out of the loop (they never change;
 # the original rebuilt this dict on every iteration).
 parms = {
  "host": "127.0.0.1",
  "port": 3306,
  "user": "root",
  "passwd": "123456",
  "db": "db",
  "charset": "utf8"
 }
 # Crawl listing pages 1..15, accumulating rows into the module-level
 # `result` list via parse_html.
 for page in range(1,16):
  url = "https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p={0}".format(str(page))
  html = get_html(url)
  parse_html(html)
 # BUG FIX: the original statement contained a literal backslash —
 # "...content,a,b)\Values(..." — which is invalid SQL (\V is not a Python
 # escape, so the backslash survived into the query). Replaced with a space.
 sql_insert = "INSERT INTO db(Naame,score,price,content,a,b) Values(%s,%s,%s,%s,%s,%s)"
 join_all(sql_insert,result,**parms)
 print(result)

總結

本篇文章就到這里了,希望能夠給你帶來幫助,也希望您能夠多多關(guān)注本站的更多內(nèi)容!

版權(quán)聲明:本站文章來源標注為YINGSOO的內(nèi)容版權(quán)均為本站所有,歡迎引用、轉(zhuǎn)載,請保持原文完整并注明來源及原文鏈接。禁止復(fù)制或仿造本網(wǎng)站,禁止在非www.sddonglingsh.com所屬的服務(wù)器上建立鏡像,否則將依法追究法律責(zé)任。本站部分內(nèi)容來源于網(wǎng)友推薦、互聯(lián)網(wǎng)收集整理而來,僅供學(xué)習(xí)參考,不代表本站立場,如有內(nèi)容涉嫌侵權(quán),請聯(lián)系alex-e#qq.com處理。

相關(guān)文章

實時開通

自選配置、實時開通

免備案

全球線路精選!

全天候客戶服務(wù)

7x24全年不間斷在線

專屬顧問服務(wù)

1對1客戶咨詢顧問

在線
客服

在線客服:7*24小時在線

客服
熱線

400-630-3752
7*24小時客服服務(wù)熱線

關(guān)注
微信

關(guān)注官方微信
頂部