Python爬蟲入門案例之爬取二手房源數據
本文重點
- 系統分析網頁性質
- 結構化的數據解析
- csv數據保存
環境介紹
- python 3.8
- pycharm 專業版 >>> 激活碼
#模塊使用
- requests >>> pip install requests
- parsel >>> pip install parsel
- csv
【付費VIP完整版】只要看了就能學會的教程,80集Python基礎入門視頻教學
點這里即可免費在線觀看
爬蟲代碼實現步驟: 發送請求 >>> 獲取數據 >>> 解析數據 >>> 保存數據
導入模塊
import csv
import re

import parsel  # HTML parsing module (third-party): pip install parsel
import requests  # HTTP request module (third-party): pip install requests
發送請求, 對于房源列表頁發送請求
# Step 1 - request the second-hand housing listing page.
url = 'https://bj.lianjia.com/ershoufang/pg1/'

# A browser-like User-Agent is sent so the server sees the script as a
# regular browser rather than a bare Python client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}

# requests does not time out by default; bound the wait so a stalled
# connection cannot hang the script forever.
response = requests.get(url=url, headers=headers, timeout=10)

# Stop here on an HTTP error status instead of parsing an error page.
response.raise_for_status()
獲取數據
# Step 2 - show the raw HTML that came back for the listing page.
_page_html = response.text
print(_page_html)
解析數據
# Step 3 - parse the listing page, then visit each detail page and write one
# CSV row per listing.


def _text_or_empty(sel, query):
    """Return the first text matched by CSS ``query``, or '' when nothing matches.

    The chained ``.split`` / ``.replace`` calls below would raise
    AttributeError on None if a detail page lacks the element.
    """
    return sel.css(query).get(default='')


# Captures the digits of the total-floor count from the floor description.
# A raw string keeps ``\d`` from being read as an invalid string escape.
# NOTE(review): confirm this literal matches the characters the site serves.
_TOTAL_FLOORS = re.compile(r'共(\d+)層')

# Wrap the listing HTML in a Selector and collect every detail-page link.
selector_1 = parsel.Selector(response.text)
href = selector_1.css('div.leftContent li div.title a::attr(href)').getall()

for link in href:
    # Bound the wait so one stalled detail page cannot hang the whole crawl.
    html_data = requests.get(url=link, headers=headers, timeout=10).text
    selector = parsel.Selector(html_data)

    title = selector.css('.title h1::text').get()  # listing title
    area = selector.css('.areaName .info a:nth-child(1)::text').get()  # district
    community_name = selector.css('.communityName .info::text').get()  # community
    room = selector.css('.room .mainInfo::text').get()  # layout
    room_type = selector.css('.type .mainInfo::text').get()  # orientation

    # Per the original author's example the floor text looks like
    # "<position>/<total floors>": keep the part after the last '/', then
    # pull out the digits. If the pattern does not match, keep the raw text
    # rather than aborting the crawl with an IndexError.
    floor_text = _text_or_empty(selector, '.room .subInfo::text').split('/')[-1]
    floor_match = _TOTAL_FLOORS.search(floor_text)
    height = floor_match.group(1) if floor_match else floor_text

    # Decoration status: the part after the last '/'.
    sub_info = _text_or_empty(selector, '.type .subInfo::text').split('/')[-1]
    elevator = selector.css('.content li:nth-child(12)::text').get()  # elevator
    # Floor area with the unit suffix stripped.
    house_area = _text_or_empty(selector, '.content li:nth-child(3)::text').replace('㎡', '')
    price = selector.css('.price .total::text').get()  # price
    # Construction year with the trailing suffix stripped.
    date = _text_or_empty(selector, '.area .subInfo::text').replace('年建', '')

    # Keys must match the fieldnames given to csv_writer.
    dit = {
        '標(biāo)題': title,
        '市區(qū)': area,
        '小區(qū)': community_name,
        '戶型': room,
        '朝向': room_type,
        '樓層': height,
        '裝修情況': sub_info,
        '電梯': elevator,
        '面積(㎡)': house_area,
        '價(jià)格(萬(wàn)元)': price,
        '年份': date,
    }
    csv_writer.writerow(dit)
    print(title, area, community_name, room, room_type, height, sub_info,
          elevator, house_area, price, date, sep='|')
保存數據
# Step 4 - open the CSV output and build the DictWriter that receives one
# row per listing. The file is opened in append mode, so rows written by
# earlier runs are kept.
f = open('二手房數(shù)據(jù).csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '標(biāo)題',
    '市區(qū)',
    '小區(qū)',
    '戶型',
    '朝向',
    '樓層',
    '裝修情況',
    '電梯',
    '面積(㎡)',
    '價(jià)格(萬(wàn)元)',
    '年份',
])

# In append mode the stream starts at end-of-file, so position 0 means the
# file is new or empty. Write the header only then; otherwise every run
# would insert another header line in the middle of the data.
if f.tell() == 0:
    csv_writer.writeheader()
數據可視化
導入所需模塊
import pandas as pd from pyecharts.charts import Map from pyecharts.charts import Bar from pyecharts.charts import Line from pyecharts.charts import Grid from pyecharts.charts import Pie from pyecharts.charts import Scatter from pyecharts import options as opts
讀取數據
# Load the listings CSV into a DataFrame and preview its first rows.
df = pd.read_csv(
    '鏈家.csv',
    encoding='utf-8',
)
df.head()
各城區二手房數量北京市地圖
# Map of listing counts per district.
# `region` and `count` are not defined in this snippet; they must already
# exist when it runs.
new = [name + '區(qū)' for name in region]
_map_pairs = [list(pair) for pair in zip(new, count)]

m = Map()
m.add('', _map_pairs, '北京')
m.set_global_opts(
    title_opts=opts.TitleOpts(title='北京市二手房各區(qū)分布'),
    visualmap_opts=opts.VisualMapOpts(max_=3000),
)
m.render_notebook()
各城區二手房數量-平均價格柱狀圖
# Bar chart of listing counts per district with a price line overlaid on a
# second y-axis. `df_price`, `region` and `count` are not defined in this
# snippet; they must already exist when it runs.
_price_values = df_price.values.tolist()
price = [round(value, 2) for value in _price_values]

# Bars: number of listings per district.
bar = Bar()
bar.add_xaxis(region)
bar.add_yaxis('數(shù)量', count, label_opts=opts.LabelOpts(is_show=True))

# Secondary y-axis (index 1) that the price line is plotted against.
bar.extend_axis(
    yaxis=opts.AxisOpts(
        name="價(jià)格(萬(wàn)元)",
        type_="value",
        min_=200,
        max_=900,
        interval=100,
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
    )
)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='各城區(qū)二手房數(shù)量-平均價(jià)格柱狀圖'),
    tooltip_opts=opts.TooltipOpts(
        is_show=True,
        trigger="axis",
        axis_pointer_type="cross",
    ),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
    ),
    yaxis_opts=opts.AxisOpts(
        name='數(shù)量',
        axistick_opts=opts.AxisTickOpts(is_show=True),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
)

# Line: rounded price per district, bound to the secondary axis.
line2 = Line()
line2.add_xaxis(xaxis_data=region)
line2.add_yaxis(
    series_name="價(jià)格",
    yaxis_index=1,
    y_axis=price,
    label_opts=opts.LabelOpts(is_show=True),
    z=10,
)

bar.overlap(line2)

grid = Grid()
grid.add(
    bar,
    opts.GridOpts(pos_left="5%", pos_right="20%"),
    is_control_axis_index=True,
)
grid.render_notebook()
# Bar chart built from the `top_price` frame: community names on the x-axis,
# the price column as bar values. `top_price` is not defined in this
# snippet; it must already exist when it runs.
area0 = top_price['小區(qū)'].values.tolist()
count = top_price['價(jià)格(萬(wàn)元)'].values.tolist()

bar = Bar()
bar.add_xaxis(area0)
bar.add_yaxis('數(shù)量', count, category_gap='50%')
bar.set_global_opts(
    yaxis_opts=opts.AxisOpts(name='價(jià)格(萬(wàn)元)'),
    xaxis_opts=opts.AxisOpts(name='數(shù)量'),
)
bar.render_notebook()
散點圖
# Scatter plot of floor area (x) against price (y), read from `df`.
_areas = df['面積(㎡)'].values.tolist()
_prices = df['價(jià)格(萬(wàn)元)'].values.tolist()

s = Scatter()
s.add_xaxis(_areas)
s.add_yaxis('', _prices)
# Use a numeric x-axis instead of the default category axis.
s.set_global_opts(xaxis_opts=opts.AxisOpts(type_='value'))
s.render_notebook()
房屋朝向占比
# Ring pie of listings per orientation.
# `df_direction` is not defined in this snippet; its index supplies the
# slice names and its values the slice sizes.
directions = df_direction.index.tolist()
count = df_direction.values.tolist()

c1 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(directions, count)],
        radius=['20%', '60%'],
        center=['40%', '50%'],
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='房屋朝向占比', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="80%", pos_top="25%", orient="vertical"
        ),
    )
    # ECharts pie label template: {b} = slice name, {c} = value,
    # {d} = percentage. The published snippet had {b} and {d} replaced by
    # junk text, which would be rendered literally on every label.
    .set_series_opts(
        label_opts=opts.LabelOpts(formatter='{b}:{c} ({d}%)'),
        position="outside",
    )
)
c1.render_notebook()
裝修情況/有無電梯玫瑰圖(組合圖)
# Combined chart: a horizontal bar chart of decoration status with a rose
# pie overlaid in the lower-right corner.
# `df_fitment` and `df_direction` are not defined in this snippet.
fitment = df_fitment.index.tolist()
count1 = df_fitment.values.tolist()
# NOTE(review): the pie below is titled as elevator availability but is fed
# from df_direction - confirm the intended data source.
directions = df_direction.index.tolist()
count2 = df_direction.values.tolist()

bar = (
    Bar()
    .add_xaxis(fitment)
    .add_yaxis('', count1, category_gap='50%')
    .reversal_axis()  # swap the axes so the bars run horizontally
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(name='數(shù)量'),
        title_opts=opts.TitleOpts(
            title='裝修情況/有無(wú)電梯玫瑰圖(組合圖)', pos_left='33%', pos_top="5%"
        ),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="90%", pos_top="58%", orient="vertical"
        ),
    )
)

c2 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(directions, count2)],
        radius=['10%', '30%'],
        center=['75%', '65%'],
        rosetype="radius",
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='有/無(wú)電梯', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="90%", pos_top="15%", orient="vertical"
        ),
    )
    # ECharts pie label template: {b} = slice name, {c} = value,
    # {d} = percentage. The published snippet had {b} and {d} replaced by
    # junk text, which would be rendered literally on every label.
    .set_series_opts(
        label_opts=opts.LabelOpts(formatter='{b}:{c} \n ({d}%)'),
        position="outside",
    )
)

bar.overlap(c2)
bar.render_notebook()
二手房樓層分布柱狀縮放圖
# Bar chart of listing counts per floor value, with a slider to zoom the
# x-axis. `df_floor` is not defined in this snippet.
floor = df_floor.index.tolist()
count = df_floor.values.tolist()

bar = Bar()
bar.add_xaxis(floor)
bar.add_yaxis('數(shù)量', count)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='二手房樓層分布柱狀縮放圖'),
    yaxis_opts=opts.AxisOpts(name='數(shù)量'),
    xaxis_opts=opts.AxisOpts(name='樓層'),
    datazoom_opts=opts.DataZoomOpts(type_='slider'),
)
bar.render_notebook()
房屋面積分布縱向柱狀圖
# Horizontal bar chart of listing counts per floor-area group.
# `df_area` is not defined in this snippet.
area = df_area.index.tolist()
count = df_area.values.tolist()

bar = Bar()
bar.add_xaxis(area)
bar.add_yaxis('數(shù)量', count)
bar.reversal_axis()  # swap the axes so the bars run horizontally
bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='房屋面積分布縱向柱狀圖'),
    yaxis_opts=opts.AxisOpts(name='面積(㎡)'),
    xaxis_opts=opts.AxisOpts(name='數(shù)量'),
)
bar.render_notebook()
到此這篇關于Python爬蟲入門案例之爬取二手房源數據的文章就介紹到這了,更多相關Python 爬取二手房數據內容請搜索本站以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持本站!
版權聲明:本站文章來源標注為YINGSOO的內容版權均為本站所有,歡迎引用、轉載,請保持原文完整并注明來源及原文鏈接。禁止復制或仿造本網站,禁止在非www.sddonglingsh.com所屬的服務器上建立鏡像,否則將依法追究法律責任。本站部分內容來源于網友推薦、互聯網收集整理而來,僅供學習參考,不代表本站立場,如有內容涉嫌侵權,請聯系alex-e#qq.com處理。