Press "Enter" to skip to content

Python爬虫与数据可视化

1.数据挖掘

 

代码所需包

 

import urllib.request

 

import xlwt

 

import re

 

import urllib.parse

 

import time

 

进入前程无忧官网

 

我这里以搜索“大数据”职位信息为例

 

 

打开开发者模式

 

Request Headers 里面是我们用浏览器访问网站的信息,有了信息后就能模拟浏览器访问

 

这也是为了防止网站封禁IP,不过前程无忧一般是不会封IP的。

 

 

模拟浏览器

 

# Request headers copied from a desktop browser session; sending them makes
# a scripted request look like ordinary browser traffic to the site.
header = {
    'Host': 'search.51job.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.108 Safari/537.36'
    ),
}

 

 

这些基本数据都可以爬取:

 

为了实现交互型爬取,我写了一个能够实现输入想了解的职位就能爬取相关内容的函数

 

def getfront(page, item, timeout=10):
    """Download one page of 51job search results and return it as text.

    Args:
        page: Result page number; it is embedded in the URL.
        item: Search keyword typed by the user.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled request cannot block the crawl forever.

    Returns:
        The page source decoded from GBK. Bytes that are not valid GBK are
        replaced instead of aborting the whole page.

    Raises:
        OSError: The request failed or timed out (urllib.error.URLError is
            a subclass).
    """
    # Percent-encode the keyword so non-ASCII text is legal inside a URL.
    result = urllib.parse.quote(item)
    ur1 = result + ',2,' + str(page) + '.html'
    ur2 = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
    res = ur2 + ur1  # full search URL
    # Send the browser-like headers defined in the module-level `header`
    # dict; the original built them but never attached them to the request.
    request = urllib.request.Request(res, headers=header)
    with urllib.request.urlopen(request, timeout=timeout) as a:
        html = a.read().decode('gbk', errors='replace')
    return html

 

# One row of the search-result list. Capture groups, in order: job title,
# job URL, company name, company URL, then the text of the next three
# <span> elements (the caller stores those as location, salary and
# posting date).
# NOTE(review): the literal spaces and attribute-less <span> tags are kept
# exactly as published; confirm them against the live page markup.
_JOB_ROW_PATTERN = re.compile(
    r'class="t1 ">.*? <a target="_blank" title="(.*?)" href="(.*?)".*? '
    r'<span><a target="_blank" title="(.*?)" href="(.*?)".*?'
    r'<span>(.*?)</span>.*?<span>(.*?)</span>.*?<span>(.*?)</span>.*?',
    re.S,  # "." must also match newlines: one row spans several HTML lines
)


def getInformation(html):
    """Extract every job row from a search-result page.

    Args:
        html: Page source as returned by ``getfront``.

    Returns:
        A list of 7-tuples of strings, one per matched row, in capture-group
        order (title, job URL, company, company URL, and three span texts).
        The list is empty when nothing matches.
    """
    items = _JOB_ROW_PATTERN.findall(html)
    return items

 

这里我除了爬取图上信息外,还把职位超链接后的网址,以及公司超链接的网址爬取下来了。

 

这里先不讲,后面会说到。

 

接下来就需要储存信息,这里使用Excel,虽然比较麻烦,不过胜在清晰直观

 

# Create the workbook and the single "Job" sheet that receives the results.
excel1 = xlwt.Workbook()
sheet1 = excel1.add_sheet('Job', cell_overwrite_ok=True)
# Header row: one title per output column, left to right.
_column_titles = (
    '序号', '职位', '公司名称', '公司地点', '公司性质', '薪资',
    '学历要求', '工作经验', '公司规模', '公司类型', '公司福利', '发布时间',
)
for _col, _title in enumerate(_column_titles):
    sheet1.write(0, _col, _title)

 

爬取代码如下,这里就能利用双层循环来实现换页爬取与换行输出

 

我这里为了获得大量数据所以爬取了1000页,调试时可以只爬取几页

 

# Patterns for the job-detail page, compiled once rather than once per row.
# NOTE(review): the bare "|" in _job_need_re is regex alternation, not a
# literal bar, and the job_need[1][0] / job_need[2][0] lookups below rely on
# the match list that alternation produces. The whitespace runs may have
# been "&nbsp;" entities before the code was published - confirm against a
# live detail page before touching these patterns.
_company_re = re.compile(
    r'<div>.*?<p title="(.*?)"><span>.*?<p title="(.*?)">.*?<p title="(.*?)">.*?',
    re.S)
_job_need_re = re.compile(
    r'<p.*?>.*?  <span>|</span>  (.*?)  <span>|</span>  (.*?)  <span>|</span>  .*?</p>',
    re.S)
_welfare_re = re.compile(r'<span>(.*?)</span>', re.S)

number = 1      # next worksheet row; row 0 already holds the column titles
item = input()  # search keyword typed by the user
try:
    for j in range(1, 10000):  # page range; shrink it while debugging
        print("正在爬取第"+str(j)+"页数据...")
        try:
            html = getfront(j, item)  # source of one result page
        except (OSError, ValueError) as err:
            # Network failure or undecodable page: report it and move on.
            # Unlike a bare "except", this lets Ctrl-C stop the crawl.
            print("page %d skipped: %r" % (j, err))
            continue
        for i in getInformation(html):
            try:
                url1 = i[1]  # job-detail URL
                with urllib.request.urlopen(url1, timeout=10) as _resp:
                    res1 = _resp.read().decode('gbk')
                company = _company_re.findall(res1)
                job_need = _job_need_re.findall(res1)
                welfare = _welfare_re.findall(res1)
                print(i[0], i[2], i[4], i[5], company[0][0],
                      job_need[2][0], job_need[1][0],
                      company[0][1], company[0][2], welfare, i[6])
                _row = (
                    number,           # running index
                    i[0],             # job title
                    i[2],             # company name
                    i[4],             # location
                    company[0][0],    # company nature
                    i[5],             # salary
                    # Education then experience, matching the header row
                    # and the print above (the two were swapped before).
                    job_need[2][0],
                    job_need[1][0],
                    company[0][1],    # company size
                    company[0][2],    # company type
                    " ".join(str(w) for w in welfare),
                    i[6],             # posting date
                )
                for _col, _cell in enumerate(_row):
                    sheet1.write(number, _col, _cell)
            except (OSError, ValueError, IndexError) as err:
                # Detail page unreachable, not GBK, or laid out differently
                # (no regex match): skip this posting only.
                print("row skipped: %r" % (err,))
            else:
                number += 1
                # Pause between postings so the crawl is not mistaken for
                # an attack and the IP banned.
                time.sleep(0.3)
        # Persist once per page instead of rewriting the file for every row.
        excel1.save("51job.xls")
finally:
    # Keep whatever was collected even if the run is interrupted.
    excel1.save("51job.xls")

 

结果如下:

 

 

 

2.数据清洗

 

首先要打开文件

 

#coding:utf-8
 import pandas as pd
 import re
 #除此之外还要安装xlrd包
 
# Load the scraped "Job" sheet. Per the note in the tutorial, the xlrd
# package must be installed for pandas to read this .xls file.
data = pd.read_excel(io=r'51job.xls', sheet_name='Job')
result = pd.DataFrame(data=data)

 

清洗思路:

 

1、出现空值(NaN)的信息,直接删除整行

 

# Drop every row that has at least one missing (NaN) cell.
a = result.dropna(axis=0, how='any')

# Print all rows of a frame instead of eliding the middle.
pd.set_option('display.max_rows', None)

 

2、职位出错(很多职位都是与大数据无关的职业)

 

 

# Keep only postings whose title mentions the keyword; the search also
# returns many unrelated jobs.
b = u'数据'
li = a['职位']
# Vectorised substring test. The old loop walked positions 0..len-1 but
# looked rows up by index label; once earlier filtering has left gaps in
# the labels, rows whose label is >= len(li) were never examined.
_keep = li.astype(str).str.contains(b, regex=False)
a = a[_keep]
# Same end value as the old counter: 1 + number of matching rows.
number = 1 + int(_keep.sum())

 

3、其他地方出现的信息错位,比如在学历里出现 ‘招多少人’

 

 

# Drop rows whose education cell actually holds a head-count such as
# "招N人", which means the fields were shifted on the source page.
b2 = u'人'
li2 = a['学历要求']
# Mask-based filter: every remaining row is tested, whatever its index
# label (the old positional loop skipped labels >= len(li2)).
_misplaced = li2.astype(str).str.contains(b2, regex=False)
number += int(_misplaced.sum())  # counter advanced once per dropped row
a = a[~_misplaced]

 

4、转换薪资单位

 

如上图就出现单位不一致的情况

 

b3 = u'万/年'
b4 = u'千/月'


def _to_wan_per_month(text):
    """Rewrite a '万/年' or '千/月' salary range as 'min-max万/月'.

    Anything else (other units, a single figure, non-text cells) is
    returned unchanged.
    """
    if not isinstance(text, str):
        return text
    if b3 in text:
        divisor = 12   # yearly -> monthly
    elif b4 in text:
        divisor = 10   # thousands -> ten-thousands
    else:
        return text
    x = re.findall(r'\d*\.?\d+', text)
    if len(x) < 2:
        return text    # no "min-max" pair to convert
    min_ = format(float(x[0]) / divisor, '.2f')  # two decimal places
    max_ = format(float(x[1]) / divisor, '.2f')
    return min_ + '-' + max_ + u'万/月'


# The old loop assigned into li3[i][1], i.e. into a character of a str,
# which raises TypeError; the bare "except" hid it, so no salary was ever
# converted. Map the whole column instead.
a = a.copy()  # own the data before writing a column
a['薪资'] = a['薪资'].map(_to_wan_per_month)
li3 = a['薪资']

 

保存到另一个Excel文件

# Write the cleaned rows to a second workbook, without the index column.
a.to_excel('51job2.xls', sheet_name='Job', index=False)

这里只是简单的介绍了一些数据清理的思路,并不是说只要清理这些就行了

 

有时候有的公司网页并不是前程无忧类型的,而是他们公司自己做的网页,这也很容易出错

 

不过只要有了基本思路,这些都不难清理

 

3.数据可视化

 

数据可视化可以说是很重要的环节,如果只是爬取数据而不去可视化处理,那么可以说数据的价值根本没有发挥

 

可视化处理能使数据更加直观,更有利于分析

 

甚至可以说可视化是数据挖掘最重要的内容

 

同样的我们先看代码需要的包

 

# -*- coding: utf-8 -*-
 import pandas as pd
 import re
 from pyecharts import Funnel,Pie,Geo
 import matplotlib.pyplot as plt

 

这里特别强调,pyecharts包千万别装新版的,我这里装的是0.5.9版的

 

 

其次如果要做地理坐标图,热力图啥的,必须安装地图包,比如世界地图包,中国地图包,城市地图包啥的

 

 

接下来就是正戏

 

一样的先要打开文件

 

# Load the cleaned "Job" sheet.
file = pd.read_excel(r'51job2.xls', sheet_name='Job')

f = pd.DataFrame(file)

# Print all rows of a frame instead of eliding the middle.
pd.set_option('display.max_rows', None)

 

1、创建多个列表来单独存放【‘薪资’】【‘工作经验’】【‘学历要求’】【‘公司地点’】等信息

 

add = f['公司地点']
sly = f['薪资']
edu = f['学历要求']
exp = f['工作经验']
address = []      # city part of the location (text before the first '-')
salary = []       # [min, max] pairs as floats
education = []
experience = []
_number_re = re.compile(r'\d*\.?\d+')
for _place, _pay, _degree, _years in zip(add, sly, edu, exp):
    # Validate the whole row before appending anything. The old loop
    # appended the address first and then bailed out silently when the
    # salary could not be parsed, leaving the lists out of step.
    if not isinstance(_place, str) or not isinstance(_pay, str):
        continue
    s = _number_re.findall(_pay)
    if len(s) < 2:
        continue  # no "min-max" range in the salary text
    address.append(_place.split('-')[0])
    salary.append([float(s[0]), float(s[1])])
    education.append(_degree)
    experience.append(_years)

 

2、matplotlib库生成 工作经验—薪资图 与 学历—薪资图

 

# Split the [min, max] pairs into two parallel lists.
min_s = [pair[0] for pair in salary]   # lowest salary per posting
# Highest salary per posting: the old loop appended pair[0] here too, so
# the "max" column was a copy of the "min" column.
max_s = [pair[1] for pair in salary]

# Mean minimum salary by required experience.
my_df = pd.DataFrame({'experience': experience,
                      'min_salay': min_s, 'max_salay': max_s})
data1 = my_df.groupby('experience').mean()['min_salay'].plot(kind='line')
plt.show()

# Mean minimum salary by required education.
my_df2 = pd.DataFrame({'education': education,
                       'min_salay': min_s, 'max_salay': max_s})
data2 = my_df2.groupby('education').mean()['min_salay'].plot(kind='line')
plt.show()

 

 

 

3、学历要求圆环图

 

def get_edu(list):
    """Count how often each education label occurs.

    Args:
        list: Iterable of hashable labels. The name shadows the builtin but
            is kept so existing callers are unaffected.

    Returns:
        A dict mapping each distinct label to its number of occurrences, in
        first-seen order. An empty input gives an empty dict.
    """
    # Local import so the file's import header does not need to change.
    from collections import Counter

    # One pass instead of one list.count() scan per distinct label.
    education2 = dict(Counter(list))
    return education2
 # Tally the education labels and render them as a rose-type pie chart.
 dir1 = get_edu(education)
 # print(dir1)
 attr, value = dir1.keys(), dir1.values()
 _pie_style = dict(
     center=[50, 50],
     is_random=False,
     radius=[30, 75],
     rosetype='radius',
     is_legend_show=False,
     is_label_show=True,
     legend_orient='vertical',
 )
 pie = Pie("学历要求")
 pie.add("", attr, value, **_pie_style)
 pie.render('学历要求玫瑰图.html')

 

 

 

4、大数据城市需求地理位置分布图

 

def get_address(list):
    """Count postings per city, leaving out the '异地招聘' bucket.

    Args:
        list: Iterable of city names. The name shadows the builtin but is
            kept so existing callers are unaffected.

    Returns:
        A dict mapping each city to its number of occurrences, in
        first-seen order, without the '异地招聘' entry.
    """
    # Local import so the file's import header does not need to change.
    from collections import Counter

    address2 = dict(Counter(list))
    # '异地招聘' is not a place on the map. Use a default so that data
    # without that entry no longer raises KeyError.
    address2.pop('异地招聘', None)
    # Some names may be invalid or missing from the installed map package;
    # remove them here the same way if rendering complains (the author
    # previously had to drop '山东', '怒江' and '池州').
    return address2
 # Tally postings per city and plot them as animated scatter points on a
 # map of China.
 dir2 = get_address(address)
 #print(dir2)
 _geo_frame = dict(
     title_color="#2E2E2E",
     title_text_size=24,
     title_top=20,
     title_pos="center",
     width=1300,
     height=600,
 )
 geo = Geo("大数据人才需求分布图", **_geo_frame)
 attr2, value2 = dir2.keys(), dir2.values()
 _geo_series = dict(
     type="effectScatter",
     is_random=True,
     visual_range=[0, 1000],
     maptype='china',
     symbol_size=8,
     effect_scale=5,
     is_visualmap=True,
 )
 geo.add("", attr2, value2, **_geo_series)
 geo.render('大数据城市需求分布图.html')

 

 

 

5、工作经验要求漏斗图

 

def get_experience(list):
    """Count how often each experience requirement occurs.

    Args:
        list: Iterable of hashable labels. The name shadows the builtin but
            is kept so existing callers are unaffected.

    Returns:
        A dict mapping each distinct label to its number of occurrences, in
        first-seen order. An empty input gives an empty dict.
    """
    # Local import so the file's import header does not need to change.
    from collections import Counter

    # One pass instead of one list.count() scan per distinct label.
    experience2 = dict(Counter(list))
    return experience2
 # Tally the experience requirements and render them as a funnel chart.
 dir3 = get_experience(experience)
 #print(dir3)
 attr3, value3 = dir3.keys(), dir3.values()
 _funnel_style = dict(
     is_label_show=True,
     label_pos="inside",
     label_text_color="#fff",
     legend_orient='vertical',
     legend_pos='left',
 )
 funnel = Funnel("工作经验漏斗图", title_pos='center')
 funnel.add("", attr3, value3, **_funnel_style)
 funnel.render('工作经验要求漏斗图.html')

 

 

 

当然,pyecharts里面的图还有很多种,就靠大家去自己发掘了。

 

反馈

 

接到部分人反映的乱码情况,主要可能是因为网站规则变动。我去重新更新了一下代码,并且改进了一些地方,如果遇到爬取过程中途停下的情况,可能是网络问题或者陷入阻塞,可以重新运行一次代码

 

所有代码如下:

 

# -*- coding:utf-8 -*-
 import urllib.request
 import xlwt
 import re
 import urllib.parse
 import time
 # Request headers copied from a desktop browser session; sending them makes
 # a scripted request look like ordinary browser traffic to the site.
 header = {
     'Host': 'search.51job.com',
     'Upgrade-Insecure-Requests': '1',
     # Rejoined into one value: the published literal was split across
     # lines and had a stray '>' in front of "AppleWebKit".
     'User-Agent': (
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
         'AppleWebKit/537.36 (KHTML, like Gecko) '
         'Chrome/78.0.3904.108 Safari/537.36'
     ),
 }
 def getfront(page, item, timeout=10):
     """Download one page of 51job search results and return it as text.

     Args:
         page: Result page number; it is embedded in the URL.
         item: Search keyword typed by the user.
         timeout: Seconds to wait on the connection before giving up, so a
             stalled request cannot block the crawl forever.

     Returns:
         The page source decoded from GBK. Bytes that are not valid GBK are
         replaced instead of aborting the whole page.

     Raises:
         OSError: The request failed or timed out (urllib.error.URLError is
             a subclass).
     """
     # Percent-encode the keyword so non-ASCII text is legal inside a URL.
     result = urllib.parse.quote(item)
     ur1 = result + ',2,' + str(page) + '.html'
     ur2 = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
     res = ur2 + ur1  # full search URL
     # Send the browser-like headers defined in the module-level `header`
     # dict; the original built them but never attached them to the request.
     request = urllib.request.Request(res, headers=header)
     with urllib.request.urlopen(request, timeout=timeout) as a:
         html = a.read().decode('gbk', errors='replace')
     return html
 # One row of the search-result list. Capture groups, in order: job title,
 # job URL, company name, company URL, then the text of the next three
 # <span> elements (the caller stores those as location, salary and
 # posting date).
 # NOTE(review): the literal spaces and attribute-less <span> tags are kept
 # exactly as published; confirm them against the live page markup.
 _JOB_ROW_PATTERN = re.compile(
     r'class="t1 ">.*? <a target="_blank" title="(.*?)" href="(.*?)".*? '
     r'<span><a target="_blank" title="(.*?)" href="(.*?)".*?'
     r'<span>(.*?)</span>.*?<span>(.*?)</span>.*?<span>(.*?)</span>.*?',
     re.S,  # "." must also match newlines: one row spans several HTML lines
 )


 def getInformation(html):
     """Extract every job row from a search-result page.

     Args:
         html: Page source as returned by ``getfront``.

     Returns:
         A list of 7-tuples of strings, one per matched row, in capture-group
         order (title, job URL, company, company URL, and three span texts).
         The list is empty when nothing matches.
     """
     items = _JOB_ROW_PATTERN.findall(html)
     return items
 # Create the workbook and the single "Job" sheet that receives the results.
 excel1 = xlwt.Workbook()
 sheet1 = excel1.add_sheet('Job', cell_overwrite_ok=True)
 # Header row: one title per output column, left to right.
 _column_titles = (
     '序号', '职位', '公司名称', '公司地点', '公司性质', '薪资',
     '学历要求', '工作经验', '公司规模', '公司类型', '公司福利', '发布时间',
 )
 for _col, _title in enumerate(_column_titles):
     sheet1.write(0, _col, _title)
 # Patterns for the job-detail page, compiled once rather than once per row.
 # NOTE(review): the bare "|" in _job_need_re is regex alternation, not a
 # literal bar, and the job_need[1][0] / job_need[2][0] lookups below rely on
 # the match list that alternation produces. The whitespace runs may have
 # been "&nbsp;" entities before the code was published - confirm against a
 # live detail page before touching these patterns.
 _company_re = re.compile(
     r'<div class="com_tag">.*?<p title="(.*?)"><span>.*?'
     r'<p title="(.*?)">.*?<p title="(.*?)">.*?',
     re.S)
 _job_need_re = re.compile(
     r'<p class="msg ltype".*?>.*?  <span>|</span>  (.*?)  <span>|</span>  '
     r'(.*?)  <span>|</span>  .*?</p>',
     re.S)
 _welfare_re = re.compile(r'<span>(.*?)</span>', re.S)

 number = 1      # next worksheet row; row 0 already holds the column titles
 item = input()  # search keyword typed by the user
 try:
     for j in range(1, 10000):  # page range; shrink it while debugging
         print("正在爬取第"+str(j)+"页数据...")
         try:
             html = getfront(j, item)  # source of one result page
         except (OSError, ValueError) as err:
             # Network failure or undecodable page: report it and move on.
             # Unlike a bare "except", this lets Ctrl-C stop the crawl.
             print("page %d skipped: %r" % (j, err))
             continue
         for i in getInformation(html):
             try:
                 url1 = i[1]  # job-detail URL
                 with urllib.request.urlopen(url1, timeout=10) as _resp:
                     res1 = _resp.read().decode('gbk')
                 company = _company_re.findall(res1)
                 job_need = _job_need_re.findall(res1)
                 welfare = _welfare_re.findall(res1)
                 print(i[0], i[2], i[4], i[5], company[0][0],
                       job_need[2][0], job_need[1][0],
                       company[0][1], company[0][2], welfare, i[6])
                 _row = (
                     number,           # running index
                     i[0],             # job title
                     i[2],             # company name
                     i[4],             # location
                     company[0][0],    # company nature
                     i[5],             # salary
                     job_need[2][0],   # education requirement
                     job_need[1][0],   # work experience
                     company[0][1],    # company size
                     company[0][2],    # company type
                     " ".join(str(w) for w in welfare),
                     i[6],             # posting date
                 )
                 for _col, _cell in enumerate(_row):
                     sheet1.write(number, _col, _cell)
             except (OSError, ValueError, IndexError) as err:
                 # Detail page unreachable, not GBK, or laid out differently
                 # (no regex match): skip this posting only.
                 print("row skipped: %r" % (err,))
             else:
                 number += 1
                 # Pause between postings so the crawl is not mistaken for
                 # an attack and the IP banned.
                 time.sleep(0.3)
         # Persist once per page instead of rewriting the file for every row.
         excel1.save("51job.xls")
 finally:
     # Keep whatever was collected even if the run is interrupted.
     excel1.save("51job.xls")

 

#coding:utf-8
 import pandas as pd
 import re
 # Print all rows of a frame instead of eliding the middle.
 pd.set_option('display.max_rows', None)
 # Load the scraped "Job" sheet, then drop every row that has at least one
 # missing (NaN) cell.
 data = pd.read_excel(io=r'51job.xls', sheet_name='Job')
 result = pd.DataFrame(data=data)
 a = result.dropna(how='any', axis=0)
 # (The set_option call above makes pandas print every row, nothing elided.)
 # Keep only postings whose title mentions the keyword. The assignment to
 # `b` had been swallowed by the comment on this line, leaving `b`
 # undefined.
 b = u'数据'
 li = a['职位']
 # Vectorised substring test. The old loop walked positions 0..len-1 but
 # looked rows up by index label; once earlier filtering has left gaps in
 # the labels, rows whose label is >= len(li) were never examined.
 _keep = li.astype(str).str.contains(b, regex=False)
 a = a[_keep]
 # Same end value as the old counter: 1 + number of matching rows.
 number = 1 + int(_keep.sum())
 # Drop rows whose education cell actually holds a head-count such as
 # "招N人", which means the fields were shifted on the source page.
 b2 = '人'
 li2 = a['学历要求']
 # Mask-based filter: every remaining row is tested, whatever its index
 # label (the old positional loop skipped labels >= len(li2)).
 _misplaced = li2.astype(str).str.contains(b2, regex=False)
 number += int(_misplaced.sum())  # counter advanced once per dropped row
 a = a[~_misplaced]
 b3 = u'万/年'
 b4 = u'千/月'


 def _to_wan_per_month(text):
     """Rewrite a '万/年' or '千/月' salary range as 'min-max万/月'.

     Anything else (other units, a single figure, non-text cells) is
     returned unchanged.
     """
     if not isinstance(text, str):
         return text
     if b3 in text:
         divisor = 12   # yearly -> monthly
     elif b4 in text:
         divisor = 10   # thousands -> ten-thousands
     else:
         return text
     x = re.findall(r'\d*\.?\d+', text)
     if len(x) < 2:
         return text    # no "min-max" pair to convert
     min_ = format(float(x[0]) / divisor, '.2f')  # two decimal places
     max_ = format(float(x[1]) / divisor, '.2f')
     return min_ + '-' + max_ + u'万/月'


 # The old loop assigned into li3[i][1], i.e. into a character of a str,
 # which raises TypeError; the bare "except" hid it, so no salary was ever
 # converted. Map the whole column instead.
 a = a.copy()  # own the data before writing a column
 a['薪资'] = a['薪资'].map(_to_wan_per_month)
 li3 = a['薪资']
 # Write the cleaned rows to a second workbook, without the index column.
 a.to_excel(excel_writer='51job2.xls', index=False, sheet_name='Job')
 
 import pandas as pd
 import re
 from pyecharts import Funnel,Pie,Geo
 import matplotlib.pyplot as plt
 # Print all rows of a frame instead of eliding the middle.
 pd.set_option('display.max_rows', None)
 # Load the cleaned "Job" sheet.
 file = pd.read_excel(io=r'51job2.xls', sheet_name='Job')
 f = pd.DataFrame(data=file)
 add = f['公司地点']
 sly = f['薪资']
 edu = f['学历要求']
 exp = f['工作经验']
 address = []      # city part of the location (text before the first '-')
 salary = []       # [min, max] pairs as floats
 education = []
 experience = []
 _number_re = re.compile(r'\d*\.?\d+')
 for _place, _pay, _degree, _years in zip(add, sly, edu, exp):
     # Validate the whole row before appending anything. The old loop
     # appended the address first and then bailed out silently when the
     # salary could not be parsed, leaving the lists out of step.
     if not isinstance(_place, str) or not isinstance(_pay, str):
         continue
     s = _number_re.findall(_pay)
     if len(s) < 2:
         continue  # no "min-max" range in the salary text
     address.append(_place.split('-')[0])
     salary.append([float(s[0]), float(s[1])])
     education.append(_degree)
     experience.append(_years)
 # Split the [min, max] pairs into two parallel lists.
 min_s = [pair[0] for pair in salary]   # lowest salary per posting
 # Highest salary per posting: the old loop appended pair[0] here too, so
 # the "max" column was a copy of the "min" column.
 max_s = [pair[1] for pair in salary]

 # If matplotlib cannot display Chinese text, select a font that can.
 plt.rcParams['font.sans-serif'] = ['KaiTi']  # default font
 # Keep the minus sign '-' from showing as a box in saved figures.
 plt.rcParams['axes.unicode_minus'] = False

 # Mean minimum salary by required experience.
 my_df = pd.DataFrame({'experience': experience,
                       'min_salay': min_s, 'max_salay': max_s})
 data1 = my_df.groupby('experience').mean()['min_salay'].plot(kind='line')
 plt.show()

 # Mean minimum salary by required education.
 my_df2 = pd.DataFrame({'education': education,
                        'min_salay': min_s, 'max_salay': max_s})
 data2 = my_df2.groupby('education').mean()['min_salay'].plot(kind='line')
 plt.show()
 def get_edu(list):
     """Count how often each education label occurs.

     Args:
         list: Iterable of hashable labels. The name shadows the builtin but
             is kept so existing callers are unaffected.

     Returns:
         A dict mapping each distinct label to its number of occurrences, in
         first-seen order. An empty input gives an empty dict.
     """
     # Local import so the file's import header does not need to change.
     from collections import Counter

     # One pass instead of one list.count() scan per distinct label.
     education2 = dict(Counter(list))
     return education2
 # Tally the education labels and render them as a rose-type pie chart.
 dir1 = get_edu(education)
 # print(dir1)
 attr, value = dir1.keys(), dir1.values()
 _pie_style = dict(
     center=[50, 50],
     is_random=False,
     radius=[30, 75],
     rosetype='radius',
     is_legend_show=False,
     is_label_show=True,
     legend_orient='vertical',
 )
 pie = Pie("学历要求")
 pie.add("", attr, value, **_pie_style)
 pie.render('学历要求玫瑰图.html')
 def get_address(list):
     """Count postings per city, leaving out the '异地招聘' bucket.

     Args:
         list: Iterable of city names. The name shadows the builtin but is
             kept so existing callers are unaffected.

     Returns:
         A dict mapping each city to its number of occurrences, in
         first-seen order, without the '异地招聘' entry.
     """
     # Local import so the file's import header does not need to change.
     from collections import Counter

     address2 = dict(Counter(list))
     # '异地招聘' is not a place on the map. Use a default so that data
     # without that entry no longer raises KeyError.
     address2.pop('异地招聘', None)
     # Some names may be invalid or missing from the installed map package;
     # remove them here the same way if rendering complains (the author
     # previously had to drop '山东', '怒江' and '池州').
     return address2
 # Tally postings per city and plot them as animated scatter points on a
 # map of China.
 dir2 = get_address(address)
 #print(dir2)
 _geo_frame = dict(
     title_color="#2E2E2E",
     title_text_size=24,
     title_top=20,
     title_pos="center",
     width=1300,
     height=600,
 )
 geo = Geo("大数据人才需求分布图", **_geo_frame)
 attr2, value2 = dir2.keys(), dir2.values()
 _geo_series = dict(
     type="effectScatter",
     is_random=True,
     visual_range=[0, 1000],
     maptype='china',
     symbol_size=8,
     effect_scale=5,
     is_visualmap=True,
 )
 geo.add("", attr2, value2, **_geo_series)
 geo.render('大数据城市需求分布图.html')
 def get_experience(list):
     """Count how often each experience requirement occurs.

     Args:
         list: Iterable of hashable labels. The name shadows the builtin but
             is kept so existing callers are unaffected.

     Returns:
         A dict mapping each distinct label to its number of occurrences, in
         first-seen order. An empty input gives an empty dict.
     """
     # Local import so the file's import header does not need to change.
     from collections import Counter

     # One pass instead of one list.count() scan per distinct label.
     experience2 = dict(Counter(list))
     return experience2
 # Tally the experience requirements and render them as a funnel chart.
 dir3 = get_experience(experience)
 #print(dir3)
 attr3, value3 = dir3.keys(), dir3.values()
 _funnel_style = dict(
     is_label_show=True,
     label_pos="inside",
     label_text_color="#fff",
     legend_orient='vertical',
     legend_pos='left',
 )
 funnel = Funnel("工作经验漏斗图", title_pos='center')
 funnel.add("", attr3, value3, **_funnel_style)
 funnel.render('工作经验要求漏斗图.html')

 

 

HTML文件最好用谷歌浏览器打开,如果点开没反应可以在文件夹里找到该文件然后打开

 

Be First to Comment

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注