点击上方“印象python”,选择“星标”公众号
重磅干货,第一时间送达!
今天我们打算爬取一下字节跳动的招聘信息:
参数包装函数
import re


def warp_heareder(s):
    """Print raw copied browser headers as a Python dict literal.

    Takes the multi-line "Key: Value" text copied from the browser's
    devtools and prints it formatted as a dict, quoting any key/value
    that contains letters so the output can be pasted into code.
    (Name kept as-is for compatibility; "wrap_header" was likely intended.)
    """
    print("{")
    for line in s.splitlines():
        # Split on the FIRST ": " only — header values such as
        # "a: b" or dates would otherwise raise ValueError on unpack.
        k, v = line.split(": ", 1)
        if re.search("[a-zA-Z]", k):
            k = f'"{k}"'
        if re.search("[a-zA-Z]", v):
            v = f'"{v}"'
        print(f" {k}: {v},")
    print("}")
import requests

# Use a Session so the CSRF-token cookie the server sets persists
# across the subsequent API requests.
session = requests.session()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Origin': 'https://jobs.bytedance.com',
    # NOTE: the original text contained the mojibake "¤t=1" here — the
    # HTML entity "&curren" ate "&current"; restored to "&current=1".
    'Referer': 'https://jobs.bytedance.com/experienced/position?keywords=&category=&location=&project=&type=&job_hot_flag=&current=1&limit=10'
}
data = {
    "portal_entrance": 1
}
# POST to the token endpoint; the response sets the "atsx-csrf-token" cookie.
url = "https://jobs.bytedance.com/api/v1/csrf/token"
r = session.post(url, headers=headers, data=data)
r
<Response [200]>
# Collect the cookies the token endpoint just set; the CSRF token is
# carried in "atsx-csrf-token".
cookies = session.cookies.get_dict()
cookies
# (captured REPL output below — the cookie value is percent-encoded)
{'atsx-csrf-token': 'RDTEznQqdr3O3h9PjRdWjfkSRW79K_G16g85FrXNxm0%3D'}
from urllib.parse import unquote
# Decode the percent-encoding ("%3D" -> "=") before echoing the token
# back as a request header.
unquote(cookies['atsx-csrf-token'])
'RDTEznQqdr3O3h9PjRdWjfkSRW79K_G16g85FrXNxm0='
开始爬取第一页的数据
import requests
import json

# Headers for the job-search API.  The CSRF token must be echoed back in
# the "x-csrf-token" header, URL-decoded (the cookie value is
# percent-encoded).
headers = {
    "Accept": "application/json, text/plain, */*",
    "Host": "jobs.bytedance.com",
    "Origin": "https://jobs.bytedance.com",
    # "¤t" mojibake ("&curren" HTML entity) restored to "&current".
    "Referer": "https://jobs.bytedance.com/experienced/position?keywords=&category=&location=&project=&type=&job_hot_flag=&current=1&limit=10",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "x-csrf-token": unquote(cookies['atsx-csrf-token']),
}
# Search payload: empty filter lists mean "no filter"; limit/offset page
# through the postings ten at a time, starting at the first page.
data = {
    "job_category_id_list": [],
    "keyword": "",
    "limit": 10,
    "location_code_list": [],
    "offset": 0,
    "portal_entrance": 1,
    "portal_type": 2,
    "recruitment_id_list": [],
    "subject_id_list": []
}
url = "https://jobs.bytedance.com/api/v1/search/job/posts"
# Serialize the payload ourselves — the endpoint expects a raw JSON body,
# not the form encoding requests would apply to a plain dict.
r = session.post(url, headers=headers, data=json.dumps(data))
r
<Response [200]>
r.json()
使用Pandas对json数据进行处理
import pandas as pd

# Load the first page of postings into a DataFrame for cleanup.
df = pd.DataFrame(r.json()['data']['job_post_list'])
df.head(3)
# Several columns hold nested dicts; reduce each to a display string.
df.city_info = df.city_info.str['name']
df.recruit_type = df.recruit_type.str['parent'].str['name']
# Label as "parent-child" when the category has a parent, else just the name.
df.job_category = [
    f"{x['parent']['name']}-{x['name']}" if x['parent'] else x['name']
    for x in df.job_category.values
]
# publish_time arrives as epoch milliseconds.
df.publish_time = df.publish_time.apply(lambda ms: pd.Timestamp(ms, unit="ms"))
df.head(2)
# Drop columns that add nothing to the analysis.
df.drop(columns=['sub_title', 'job_hot_flag', 'job_subject'], inplace=True)
df.head()
爬取字节跳动全部职位信息
import requests
import json  # FIX: used below via json.dumps but was never imported in this section
from urllib.parse import unquote
import pandas as pd
import time
import os

session = requests.session()
# Ask for up to 1500 postings per API call to minimise round trips.
page = 1500

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Origin': 'https://jobs.bytedance.com',
    # "¤t" mojibake ("&curren" HTML entity) restored to "&current".
    'Referer': f'https://jobs.bytedance.com/experienced/position?keywords=&category=&location=&project=&type=&job_hot_flag=&current=1&limit={page}'
}
data = {
    "portal_entrance": 1
}
# First request: obtain the anti-CSRF token cookie.
url = "https://jobs.bytedance.com/api/v1/csrf/token"
r = session.post(url, headers=headers, data=data)
cookies = session.cookies.get_dict()

url = "https://jobs.bytedance.com/api/v1/search/job/posts"
# The cookie value is percent-encoded; the header wants the decoded form.
headers["x-csrf-token"] = unquote(cookies["atsx-csrf-token"])
data = {
    "job_category_id_list": [],
    "keyword": "",
    "limit": page,
    "location_code_list": [],
    "offset": 0,
    "portal_entrance": 1,
    "portal_type": 2,
    "recruitment_id_list": [],
    "subject_id_list": []
}
# Walk pages until an empty page signals the end of the listings.
for i in range(11):
    print(f"准备爬取第{i}页")
    data["offset"] = i * page
    # Retry until a response object is actually received.
    # FIX: the original used `while not r:` — requests.Response is falsy
    # for 4xx/5xx statuses, so an error response would spin forever
    # without issuing a new request.  Test for None instead.
    r = None
    while r is None:
        try:
            r = session.post(url, headers=headers,
                             data=json.dumps(data), timeout=3)
        except Exception as e:
            print("访问超时!等待5s", e)
            time.sleep(5)
    df = pd.DataFrame(r.json()['data']['job_post_list'])
    if df.shape[0] == 0:
        # Empty page: we've walked past the last posting.
        print("爬取完毕!!!")
        break
    # Flatten nested dict columns into display strings.
    df.city_info = df.city_info.str['name']
    df.recruit_type = df.recruit_type.str['parent'].str['name']
    tmp = []
    for x in df.job_category.values:
        if x['parent']:
            tmp.append(f"{x['parent']['name']}-{x['name']}")
        else:
            tmp.append(x['name'])
    df.job_category = tmp
    # publish_time arrives as epoch milliseconds.
    df.publish_time = df.publish_time.apply(
        lambda x: pd.Timestamp(x, unit="ms"))
    df.drop(columns=['sub_title', 'job_hot_flag', 'job_subject'], inplace=True)
    # Append to the CSV, writing the header row only when the file is new.
    df.to_csv("bytedance_jobs.csv", mode="a",
              header=not os.path.exists("bytedance_jobs.csv"), index=False)
    print(",".join(df.title.head(10)))
# 对结果去重
# The append-mode crawl can accumulate duplicate rows (e.g. overlapping
# pages); drop them and rewrite the CSV in place.
df = pd.read_csv("bytedance_jobs.csv")
df = df.drop_duplicates()
df.to_csv("bytedance_jobs.csv", index=False)
print("共爬取", df.shape[0], "行无重复数据")
import pandas as pd
# Reload the deduplicated CSV to inspect the final result.
df = pd.read_csv("bytedance_jobs.csv")
df
补充资料
CSRF的含义
CSRF的攻击原理
<img src="http://blog.example/admin/add?title=csrf&body=hack" />
防范CSRF攻击的方法
一些问题的解释
r = session.post(url, headers=headers, data=json.dumps(data))
而不是直接使用r = session.post(url, headers=headers, data=data)
的原因是:requests 的 data= 参数收到字典时,会把请求体按 HTML 表单格式(application/x-www-form-urlencoded)编码发送,而该接口要求请求体是 JSON 文本,所以需要先用 json.dumps 把字典序列化成 JSON 字符串(等价的做法是直接使用 json=data 参数,requests 会自动完成序列化)。回复下方 「关键词」,获取优质资源
回复关键词 「linux」,即可获取 185 页 Linux 工具快速教程手册和154页的Linux笔记。
回复关键词 「Python进阶」,即可获取 106 页 Python 进阶文档 PDF
回复关键词 「Python面试题」,即可获取最新 100道 面试题 PDF
回复关键词 「python数据分析」,即可获取47页python数据分析与自然语言处理的 PDF
回复关键词 「python爬虫」,满满五份PPT爬虫教程和70多个案例
回复关键词 「Python最强基础学习文档」,即可获取 168 页 Python 最强基础学习文档 PDF,让你快速入门Python 推荐我的微信号
来围观我的朋友圈,我的经验分享,技术更新,不定期送书,坑位有限,速速扫码添加!
备注:开发方向_昵称_城市,另送你10本Python电子书点个在看你最好看