人妖在线一区,国产日韩欧美一区二区综合在线,国产啪精品视频网站免费,欧美内射深插日本少妇

新聞動態(tài)

Python數(shù)據(jù)分析之Python和Selenium爬取BOSS直聘崗位

發(fā)布日期:2022-05-06 14:15 | 文章來源:源碼之家

一、數(shù)據(jù)爬取的代碼

#encoding='utf-8'
from selenium import webdriver
import time
import re
import pandas as pd
import os
def close_windows():
 #如果有登錄彈窗,就關(guān)閉
 try:
  time.sleep(0.5)
  if dr.find_element_by_class_name("jconfirm").find_element_by_class_name("closeIcon"):
dr.find_element_by_class_name("jconfirm").find_element_by_class_name("closeIcon").click()
 except BaseException as e:
  print('close_windows,沒有彈窗',e)

def get_current_region_job(k_index):
 flag = 0
 # page_num_set=0#每區(qū)獲取多少條數(shù)據(jù),對30取整
 df_empty = pd.DataFrame(columns=['崗位', '地點', '薪資', '工作經(jīng)驗', '學(xué)歷', '公司', '技能'])
 while (flag == 0):
  # while (page_num_set<151)&(flag == 0):#每次只能獲取150條信息
  time.sleep(0.5)
  close_windows()
  job_list = dr.find_elements_by_class_name("job-primary")
  for job in job_list:#獲取當前頁的職位30條
job_name = job.find_element_by_class_name("job-name").text
# print(job_name)
job_area = job.find_element_by_class_name("job-area").text
salary = job.find_element_by_class_name("red").get_attribute("textContent")  # 獲取薪資
# salary_raw = job.find_element_by_class_name("red").get_attribute("textContent")  # 獲取薪資
# salary_split = salary_raw.split('·')  # 根據(jù)·分割
# salary = salary_split[0]  # 只取薪資,去掉多少薪
# if re.search(r'天', salary):
#  continue
experience_education = job.find_element_by_class_name("job-limit").find_element_by_tag_name(
 "p").get_attribute("innerHTML")
# experience_education_raw = '1-3年<em class="vline"></em>本科'
experience_education_raw = experience_education
split_str = re.search(r'[a-zA-Z =<>/"]{23}', experience_education_raw)  # 搜索分割字符串<em class="vline"></em>
# print(split_str)
experience_education_replace = re.sub(r'[a-zA-Z =<>/"]{23}', ",", experience_education_raw)  # 分割字符串替換為逗號
# print(experience_education_replace)
experience_education_list = experience_education_replace.split(',')  # 根據(jù)逗號分割
# print('experience_education_list:',experience_education_list)
if len(experience_education_list)!=2:
 print('experience_education_list不是2個,跳過該數(shù)據(jù)',experience_education_list)
 break
experience = experience_education_list[0]
education = experience_education_list[1]
# print(experience)
# print(education)

company = job.find_element_by_class_name("company-text").find_element_by_class_name("name").text
skill_list = job.find_element_by_class_name("tags").find_elements_by_class_name("tag-item")
skill = []
for skill_i in skill_list:
 skill_i_text = skill_i.text
 if len(skill_i_text) == 0:
  continue
 skill.append(skill_i_text)
# print(job_name)
# print(skill)
df_empty.loc[k_index, :] = [job_name, job_area, salary, experience, education, company, skill]
k_index = k_index + 1
# page_num_set=page_num_set+1
print("已經(jīng)讀取數(shù)據(jù){}條".format(k_index))
  close_windows()
  try:#點擊下一頁
cur_page_num=dr.find_element_by_class_name("page").find_element_by_class_name("cur").text
# print('cur_page_num',cur_page_num)
#點擊下一頁
element = dr.find_element_by_class_name("page").find_element_by_class_name("next")
dr.execute_script("arguments[0].click();", element)
time.sleep(1)
# print('點擊下一頁')
new_page_num=dr.find_element_by_class_name("page").find_element_by_class_name("cur").text
# print('new_page_num',new_page_num)
if cur_page_num==new_page_num:
 flag = 1
 break
  except BaseException as e:
print('點擊下一頁錯誤',e)
break
 print(df_empty)
 if os.path.exists("數(shù)據(jù).csv"):#存在追加,不存在創(chuàng)建
  df_empty.to_csv('數(shù)據(jù).csv', mode='a', header=False, index=None, encoding='gb18030')
 else:
  df_empty.to_csv("數(shù)據(jù).csv", index=False, encoding='gb18030')
 return k_index

def main():
 # 打開瀏覽器
 # dr = webdriver.Firefox()
 global dr
 dr = webdriver.Chrome()
 # dr = webdriver.Ie()
 # # 后臺打開瀏覽器
 # option=webdriver.ChromeOptions()
 # option.add_argument('headless')
 # dr = webdriver.Chrome(chrome_options=option)
 # print("打開瀏覽器")
 # 將瀏覽器最大化顯示
 dr.maximize_window()
 # 轉(zhuǎn)到目標網(wǎng)址
 # dr.get("https://www.zhipin.com/job_detail/?query=Python&city=100010000&industry=&position=")#全國
 dr.get("https://www.zhipin.com/c101010100/?query=Python&ka=sel-city-101010100")#北京
 print("打開網(wǎng)址")
 time.sleep(5)
 k_index = 0#數(shù)據(jù)條數(shù)、DataFrame索引
 flag_hot_city=0
 for i in range(3,17,1):
  # print('第',i-2,'頁')
  # try:
  # 獲取城市
  close_windows()
  hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a")
  close_windows()
  # hot_city_list[i].click()#防止彈窗,改為下面兩句
  # element_hot_city_list_first = hot_city_list[i]
  dr.execute_script("arguments[0].click();", hot_city_list[i])
  # 輸出城市名
  close_windows()
  hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a")
  print('城市:{}'.format(i-2),hot_city_list[i].text)
  time.sleep(0.5)

  # 獲取區(qū)縣
  for j in range(1,50,1):
# print('第', j , '個區(qū)域')
# try:
# close_windows()
# hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a")
# 在這個for循環(huán)點一下城市,不然識別不到當前頁面已經(jīng)更新了
close_windows()
hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a")
close_windows()
# hot_city_list[i].click()#防止彈窗,改為下面
dr.execute_script("arguments[0].click();", hot_city_list[i])
#輸出區(qū)縣名稱
close_windows()
city_district = dr.find_element_by_class_name("condition-district").find_elements_by_tag_name("a")
if len(city_district)==j:
 print('遍歷完所有區(qū)縣,沒有不可點擊的,跳轉(zhuǎn)下一個城市')
 break
print('區(qū)縣:',j, city_district[j].text)
# city_district_value=city_district[j].text#當前頁面的區(qū)縣值

# 點擊區(qū)縣
close_windows()
city_district=  dr.find_element_by_class_name("condition-district").find_elements_by_tag_name("a")
close_windows()
# city_district[j].click()]#防止彈窗,改為下面兩句
# element_city_district = city_district[j]
dr.execute_script("arguments[0].click();", city_district[j])

#判斷區(qū)縣是不是點完了
close_windows()
hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a")
print('點擊后這里應(yīng)該是區(qū)縣', hot_city_list[1].text)#如果是不限,說明點完了,跳出
hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a")
print('如果點完了,這里應(yīng)該是不限:',hot_city_list[1].text)
hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a")
if hot_city_list[1].text == '不限':
 print('當前區(qū)縣已經(jīng)點完了,點擊下一個城市')
 flag_hot_city=1
 break

close_windows()
k_index = get_current_region_job(k_index)#獲取職位,爬取數(shù)據(jù)

# 重新點回城市頁面,再次獲取區(qū)縣。但此時多了區(qū)縣,所以i+1
close_windows()
hot_city_list = dr.find_element_by_class_name("condition-city").find_elements_by_tag_name("a")
close_windows()
# hot_city_list[i+1].click()#防止彈窗,改為下面兩句
# element_hot_city_list_again = hot_city_list[i+1]
dr.execute_script("arguments[0].click();", hot_city_list[i+1])

# except BaseException as e:
#  print('main的j循環(huán)-獲取區(qū)縣發(fā)生錯誤:', e)
#  close_windows()
time.sleep(0.5)

  # except BaseException as e:
  #  print('main的i循環(huán)發(fā)生錯誤:',e)
  #  close_windows()
  time.sleep(0.5)
 # 退出瀏覽器
 dr.quit()
 # p1.close()

if __name__ == '__main__':
 main()

二、獲取到的數(shù)據(jù)如圖所示

三、數(shù)據(jù)分析的代碼

# coding=utf-8
import collections
import wordcloud
import re
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # 顯示中文標簽
plt.rcParams['axes.unicode_minus'] = False  # 設(shè)置正常顯示符號

def create_dir_not_exist(path):  # 判斷文件夾是否存在,不存在-新建
 if not os.path.exists(path):
  os.mkdir(path)

create_dir_not_exist(r'./image')
create_dir_not_exist(r'./image/city')
data = pd.read_csv('數(shù)據(jù).csv', encoding='gb18030')
data_df = pd.DataFrame(data)
print("\n查看是否有缺失值\n", data_df.isnull().sum())
data_df_del_empty = data_df.dropna(subset=['崗位'], axis=0)
# print("\n刪除缺失值‘崗位'的整行\(zhòng)n",data_df_del_empty)
data_df_del_empty = data_df_del_empty.dropna(subset=['公司'], axis=0)
# print("\n刪除缺失值‘公司'的整行\(zhòng)n",data_df_del_empty)
print("\n查看是否有缺失值\n", data_df_del_empty.isnull().sum())
print('去除缺失值后\n', data_df_del_empty)
data_df_python_keyword = data_df_del_empty.loc[data_df_del_empty['崗位'].str.contains('Python|python')]
# print(data_df_python_keyword)#篩選帶有python的行
# 區(qū)間最小薪資
data_df_python_keyword_salary = data_df_python_keyword['薪資'].str.split('-', expand=True)[0]
print(data_df_python_keyword_salary)  # 區(qū)間最小薪資
# Dataframe新增一列  在第 列新增一列名為' ' 的一列 數(shù)據(jù)
data_df_python_keyword.insert(7, '區(qū)間最小薪資(K)', data_df_python_keyword_salary)
print(data_df_python_keyword)
# 城市地區(qū)
data_df_python_keyword_location_city = data_df_python_keyword['地點'].str.split('·', expand=True)[0]
print(data_df_python_keyword_location_city)  # 北京
data_df_python_keyword_location_district = data_df_python_keyword['地點'].str.split('·', expand=True)[1]
print(data_df_python_keyword_location_district)  # 海淀區(qū)
data_df_python_keyword_location_city_district = []
for city, district in zip(data_df_python_keyword_location_city, data_df_python_keyword_location_district):
 city_district = city + district
 data_df_python_keyword_location_city_district.append(city_district)
print(data_df_python_keyword_location_city_district)  # 北京海淀區(qū)
# Dataframe新增一列  在第 列新增一列名為' ' 的一列 數(shù)據(jù)
data_df_python_keyword.insert(8, '城市地區(qū)', data_df_python_keyword_location_city_district)
print(data_df_python_keyword)
data_df_python_keyword.insert(9, '城市', data_df_python_keyword_location_city)
data_df_python_keyword.insert(10, '地區(qū)', data_df_python_keyword_location_district)
data_df_python_keyword.to_csv("data_df_python_keyword.csv", index=False, encoding='gb18030')
print('-------------------------------------------')

def draw_bar(row_lable, title):
 figsize_x = 10
 figsize_y = 6
 global list1_education, list2_education, df1, df2
 plt.figure(figsize=(figsize_x, figsize_y))
 list1_education = []
 list2_education = []
 for df1, df2 in data_df_python_keyword.groupby(row_lable):
  list1_education.append(df1)
  list2_education.append(len(df2))
 # print(list1_education)
 # print(list2_education)
 # 利用 * 解包方式 將 一個排序好的元組,通過元組生成器再轉(zhuǎn)成list
 # print(*sorted(zip(list2_education,list1_education)))
 # print(sorted(zip(list2_education,list1_education)))
 # 排序,兩個列表對應(yīng)原始排序,按第幾個列表排序,注意先后位置
 list2_education, list1_education = (list(t) for t in zip(*sorted(zip(list2_education, list1_education))))
 plt.bar(list1_education, list2_education)
 plt.title('{}'.format(title))
 plt.savefig('./image/{}分析.jpg'.format(title))
 # plt.show()
 plt.close()

# 學(xué)歷
draw_bar('學(xué)歷', '學(xué)歷')
draw_bar('工作經(jīng)驗', '工作經(jīng)驗')
draw_bar('區(qū)間最小薪資(K)', '14個熱門城市的薪資分布情況(K)')
# -----------------------------------------
# 根據(jù)城市地區(qū)求均值
list_group_city1 = []
list_group_city2 = []
for df1, df2 in data_df_python_keyword.groupby(data_df_python_keyword['城市地區(qū)']):
 # print(df1)
 # print(df2)
 list_group_city1.append(df1)
 salary_list_district = [int(i) for i in (df2['區(qū)間最小薪資(K)'].values.tolist())]
 district_salary_mean = round(np.mean(salary_list_district), 2)  # 每個區(qū)縣的平均薪資 round(a, 2)保留2位小數(shù)
 list_group_city2.append(district_salary_mean)
 list_group_city2, list_group_city1 = (list(t) for t in
  zip(*sorted(zip(list_group_city2, list_group_city1), reverse=False)))
#
# print(list_group_city1)
# print(list_group_city2)
plt.figure(figsize=(10, 50))
plt.barh(list_group_city1, list_group_city2)
# 坐標軸上的文字說明
for ax, ay in zip(list_group_city1, list_group_city2):
 # 設(shè)置文字說明 第一、二個參數(shù):坐標軸上的值; 第三個參數(shù):說明文字;ha:垂直對齊方式;va:水平對齊方式
 plt.text(ay, ax, '%.2f' % ay, ha='center', va='bottom')
plt.title('14個熱門城市的各區(qū)縣招聘工資情況(K)')
plt.savefig('./image/14個熱門城市的各區(qū)縣招聘工資情況(K).jpg')
# plt.show()
plt.close()
# -----------------------------------------
# 根據(jù)城市分組排序,
list_group_city11 = []
list_group_city22 = []
list_group_city33 = []
list_group_city44 = []
for df_city1, df_city2 in data_df_python_keyword.groupby(data_df_python_keyword['城市']):
 # print(df_city1)#市
 # print(df_city2)
 list_group_district2 = []  # 區(qū)縣列表
 district_mean_salary2 = []  # 工資均值列表
 for df_district1, df_district2 in df_city2.groupby(data_df_python_keyword['地區(qū)']):
  # print(df_district1)#區(qū)縣
  # print(df_district2)#工作
  list_group_district2.append(df_district1)  # 記錄區(qū)縣
  salary_list_district2 = [int(i) for i in (df_district2['區(qū)間最小薪資(K)'].values.tolist())]  # 工資列表
  district_salary_mean2 = round(np.mean(salary_list_district2), 2)  # 每個區(qū)縣的平均薪資 round(a, 2)保留2位小數(shù)
  district_mean_salary2.append(district_salary_mean2)  # 記錄區(qū)縣的平均工作的列表
 district_mean_salary2, list_group_district2 = (list(tt) for tt in zip(
  *sorted(zip(district_mean_salary2, list_group_district2), reverse=True)))
 plt.figure(figsize=(10, 6))
 plt.bar(list_group_district2, district_mean_salary2)
 # 坐標軸上的文字說明
 for ax, ay in zip(list_group_district2, district_mean_salary2):
  # 設(shè)置文字說明 第一、二個參數(shù):坐標軸上的值; 第三個參數(shù):說明文字;ha:垂直對齊方式;va:水平對齊方式
  plt.text(ax, ay, '%.2f' % ay, ha='center', va='bottom')
 plt.title('14個熱門城市的各區(qū)縣招聘工資情況_{}(K)'.format(df_city1))
 plt.savefig('./image/city/14個熱門城市的各區(qū)縣招聘工資情況_{}(K).jpg'.format(df_city1))
 # plt.show()
 plt.close()
# ----------------------------------------------------

skill_all = data_df_python_keyword['技能']
print(skill_all)
skill_list = []
for i in skill_all:
 # print(type(i))
 print(i)
 # print(i.split(", | ' | \[ | \]  |  \" | "))
 result = re.split(r'[,\' \[, \]  ]', i)
 print(result)
 # if type(i) == list:
 skill_list = skill_list + result
print('++++++++++++++++++++++++++++++++')
# print(skill_list)
list_new = skill_list
# 詞頻統(tǒng)計
word_counts = collections.Counter(list_new)  # 對分詞做詞頻統(tǒng)計
word_counts_top10 = word_counts.most_common(30)  # 獲取前10最高頻的詞
# print (word_counts_top10) # 輸出檢查
# print (word_counts_top10[0][0]) # 輸出檢查
# 生成柱狀圖
list_x = []
list_y = []
for i in word_counts_top10:
 list_x.append(i[0])
 list_y.append(i[1])
print('list_x', list_x[1:])
print('list_y', list_y[1:])
plt.figure(figsize=(30, 5))
plt.bar(list_x[1:], list_y[1:])
plt.savefig('./image/技能棧_詞頻_柱狀圖.png')
# plt.show()
plt.close()
list_new = " ".join(list_new)  # 列表轉(zhuǎn)字符串,以空格間隔
# print(list_new)

wc = wordcloud.WordCloud(
 width=800,
 height=600,
 background_color="#ffffff",  # 設(shè)置背景顏色
 max_words=50,  # 詞的最大數(shù)(默認為200)
 max_font_size=60,  # 最大字體尺寸
 min_font_size=10,  # 最小字體尺寸(默認為4)
 # colormap='bone',  # string or matplotlib colormap, default="viridis"
 colormap='hsv',  # string or matplotlib colormap, default="viridis"
 random_state=20,  # 設(shè)置有多少種隨機生成狀態(tài),即有多少種配色方案
 # mask=plt.imread("mask2.gif"),  # 讀取遮罩圖片?。? font_path='simhei.ttf'
)
my_wordcloud = wc.generate(list_new)
plt.imshow(my_wordcloud)
plt.axis("off")
# plt.show()
wc.to_file('./image/技能棧_詞云.png')  # 保存圖片文件
plt.close()

四、學(xué)歷分析

五、工作經(jīng)驗分析

六、14個熱門城市的各區(qū)縣招聘薪資情況

七、各城市各區(qū)縣的薪資情況

北京

上海

其余12個城市不再展示,生成代碼都一樣

八、技能棧


到此這篇關(guān)于Python數(shù)據(jù)分析之Python和Selenium爬取BOSS直聘崗位的文章就介紹到這了,更多相關(guān)Python和Selenium爬取BOSS直聘內(nèi)容請搜索本站以前的文章或繼續(xù)瀏覽下面的相關(guān)文章希望大家以后多多支持本站!

香港服務(wù)器租用

版權(quán)聲明:本站文章來源標注為YINGSOO的內(nèi)容版權(quán)均為本站所有,歡迎引用、轉(zhuǎn)載,請保持原文完整并注明來源及原文鏈接。禁止復(fù)制或仿造本網(wǎng)站,禁止在非www.sddonglingsh.com所屬的服務(wù)器上建立鏡像,否則將依法追究法律責(zé)任。本站部分內(nèi)容來源于網(wǎng)友推薦、互聯(lián)網(wǎng)收集整理而來,僅供學(xué)習(xí)參考,不代表本站立場,如有內(nèi)容涉嫌侵權(quán),請聯(lián)系alex-e#qq.com處理。

相關(guān)文章

實時開通

自選配置、實時開通

免備案

全球線路精選!

全天候客戶服務(wù)

7x24全年不間斷在線

專屬顧問服務(wù)

1對1客戶咨詢顧問

在線
客服

在線客服:7*24小時在線

客服
熱線

400-630-3752
7*24小時客服服務(wù)熱線

關(guān)注
微信

關(guān)注官方微信
頂部