主要的核心思想是:通过快捷键启动 Chrome 浏览器,然后通过 Chrome 的插件 Surfingkeys 来实现纯键盘操作页面元素(绕过谷歌的爬虫检测),接着将谷歌验证码切换到语音验证,下载语音文件并通过谷歌的语音识别服务把语音转换成文字,最后输入识别出的文字完成谷歌验证码的破解。解析拿到谷歌验证通过后的 tokenRecaptch,后续的请求大概率可以直接带上这个 token,用 Postman 之类的工具直接请求获取数据。
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Modifier key passed to pyautogui.hotkey() for browser shortcuts (find, copy,
# paste, close tab, ...). check_str_bychrome() branches on whether it is 'ctrl'.
command ='ctrl'
# Page that hosts the reCAPTCHA-protected lookup form; typed into the browser
# address bar by the spider.
RECAPTCHA_PAGE_URL = "https://baohiemxahoi.gov.vn/tracuu/Pages/tra-cuu-ho-gia-dinh.aspx"
# Interval in seconds between key presses, passed as `interval` to
# pyautogui.typewrite().
secs_between_keys=0.5
# Project configuration reader (readConfig is defined outside this view).
config = readConfig.ReadConfig()
# Proxy settings for the 'luminati' entry.
# NOTE(review): `proxy` is not referenced by any code visible here — confirm it
# is still needed.
proxy =config.get_proxy('luminati')
class VnSSChrome(ISpiderChrome):
    """Keyboard-driven Chrome spider for the baohiemxahoi.gov.vn household lookup.

    The spider opens the lookup page, gets past the reCAPTCHA through its
    audio challenge, copies the page source to read the ``tokenRecaptch``
    value, and then posts the query with ``requests`` using that token.
    """

    def __init__(self, task_id):
        """Initialise the base spider and set this spider's operation type."""
        ISpiderChrome.__init__(self, task_id)
        self.op_type = "vnss"

    @ChromeRetry(max_retry_count=2)
    def Chromestart(self, data):
        """Run one lookup and return the parsed records.

        Args:
            data: mapping with the keys ``'city'``, ``'username'`` and
                ``'idcard'`` used to fill the query form.

        Returns:
            A list of dicts with the keys ``social_no``, ``username``,
            ``gender``, ``birthday``, ``family_cdoe``, ``address`` and
            ``status``; empty when the site reports no result.

        Raises:
            Exception: when the page cannot be opened, the captcha cannot be
                solved, the token is missing, or the response cannot be
                parsed. The exception is logged (with a screenshot when
                possible) and re-raised.
        """
        # Randomise the pause pyautogui inserts after every call (0.5s - 1.0s).
        pyautogui.PAUSE = random.randint(5, 10) * 0.1
        super().ChromeStart(data)
        task_id = self.task_id
        logger.info('task_id:%s,vnss开始工作', task_id)
        ss_html = ''
        social_security_infos = []
        # Text searched for after an audio answer; finding it means the answer
        # was rejected.
        audio_failed_text = 'Yêu cầu trả lời nhiều hình ảnh xác thực chính xác'
        try:
            city = data['city']
            username = data['username']
            idcard = data['idcard']
            spider_status = 'SUCCESS'

            # NOTE(review): presumably a desktop shortcut bound to launching
            # Chrome — confirm against the machine setup.
            pyautogui.hotkey('shift', 'ctrl', 'c')
            sleep(1)
            pyautogui.typewrite(RECAPTCHA_PAGE_URL)
            pyautogui.press('enter')
            sleep(15)

            # Give the page two chances to show the captcha checkbox label,
            # reloading after each miss.
            for _ in range(2):
                logger.info('task_id:%s,检测网站是否打开成功', task_id)
                # Accent-free prefix of 'Tôi không phải là người máy'.
                if waitFor('Toi khong pha'):
                    logger.info('task_id:%s,网站打开成功', task_id)
                    break
                pyautogui.hotkey(command, 'r')
            else:
                raise Exception('task_id:'+task_id+'网站打开失败')

            # Focus the captcha checkbox and activate it.
            pyautogui.hotkey('shift', 'tab')
            pyautogui.press('enter')
            sleep(2)

            needs_retry = False
            attempt = 2
            # If the image challenge prompt is present the checkbox alone did
            # not pass, so fall back to the audio challenge.
            if check_str_bychrome('Chọn tat ca'):
                logger.info('task_id:%s,开始语音识别', task_id)
                for _ in range(3):
                    pyautogui.hotkey('shift', 'tab')
                pyautogui.press('enter')
                sleep(2)
                # The audio prompt is missing when Google blocks the client as
                # automated traffic.
                if not check_str_bychrome('Nhấn PHÁT và nhập các từ bạn nghe thấy'):
                    raise Exception('谷歌人机判断生效,需要更换代理')
                pyautogui.press('tab')
                google_repcha2(task_id)
                if check_str_bychrome(audio_failed_text):
                    needs_retry = True
                    logger.info('task_id:%s,语音识别失败', task_id)
                else:
                    logger.info('task_id:%s,语音识别成功', task_id)
                while needs_retry and attempt <= 5:
                    logger.info('task_id:%s,开始第%s次语音识别', task_id, attempt)
                    # Move focus to the challenge reload control and request a
                    # new audio clip.
                    for _ in range(4):
                        pyautogui.press('tab')
                    pyautogui.press('enter')
                    google_repcha2(task_id)
                    if not check_str_bychrome(audio_failed_text):
                        logger.info('task_id:%s,第%s次语音识别成功', task_id, attempt)
                        needs_retry = False
                    attempt += 1
            else:
                logger.info('task_id:%s,谷歌认证直接通过!', task_id)
            # Fail only when the last attempt was still rejected. Testing the
            # attempt counter instead would also reject a success that happened
            # on the final allowed attempt.
            if needs_retry:
                raise Exception('谷歌验证码尝试次数超过5次')

            # Copy the page source to the clipboard.
            # NOTE(review): 'y','s' is presumably the Surfingkeys "copy page
            # source" binding — confirm.
            pyautogui.hotkey('shift', 'tab')
            pyautogui.typewrite(['y', 's'], interval=secs_between_keys)
            recaptcha_data = pyperclip.paste()

            recaptcha_html = BeautifulSoup(recaptcha_data, "lxml")
            token_input = recaptcha_html.find(id='tokenRecaptch')
            # A missing element must take the same error path as an empty value
            # instead of crashing with AttributeError.
            recaptcha_token = token_input.get('value') if token_input is not None else None
            if not recaptcha_token:
                logger.error('task_id:%s,谷歌token查询失败:%s', task_id, recaptcha_html)
                raise Exception('谷歌token查询失败查询失败')

            proxies = get_luminatiproxy(country='vn_luminati')
            logger.info('tokenRecaptch:%s', recaptcha_token)
            headers = {
                "Accept": "*/*",
                "Origin": "https://baohiemxahoi.gov.vn",
                "Host": "baohiemxahoi.gov.vn",
                "Referer": "https://baohiemxahoi.gov.vn/tracuu/Pages/tra-cuu-ho-gia-dinh.aspx",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:69.0) Gecko/20100101 Firefox/69.0",
            }
            form_data = {
                'matinh': city,
                'tennhankhau': username,
                'cmnd': idcard,
                'mahuyen': '',
                'maxa': '',
                'mathon': '',
                'macd': '',
                'ngaysinh': '',
                'namsinh': '',
                'tokenRecaptch': recaptcha_token,
                'typetext': 'CoDau',
            }
            # NOTE(review): verify=False disables TLS certificate checks for
            # this request; kept as in the original, confirm it is intended.
            ss_data = requests.post(
                'https://baohiemxahoi.gov.vn/UserControls/BHXH/BaoHiemYTe/HienThiHoGiaDinh/pListKoOTP.aspx',
                data=form_data,
                headers=headers,
                proxies=proxies,
                timeout=60,
                verify=False,
            )
            ss_html = BeautifulSoup(ss_data.text, "lxml")
            no_data = ss_html.find_all(text='Không có kết quả cần tìm')
            ss_list = ss_html.find(id='contentChiTietHGD')
            if not no_data:
                if ss_list is None:
                    raise Exception("result table 'contentChiTietHGD' not found in response")
                for ss_tr in ss_list.find_all('tr'):
                    ss_tds = ss_tr.find_all('td')
                    # Skip rows without the full set of data cells (for
                    # example a header row) instead of failing on the index.
                    if len(ss_tds) < 8:
                        continue
                    social_security_infos.append({
                        'social_no': ss_tds[1].text,
                        'username': ss_tds[2].text,
                        'gender': ss_tds[3].text,
                        'birthday': date_to_china(ss_tds[4].text, '/', '-'),
                        'family_cdoe': ss_tds[5].text,
                        'address': ss_tds[6].text,
                        'status': ss_tds[7].text,
                    })
            self.ChromeFinish(social_security_infos, spider_status)
            logger.info('vnss爬取成功:%s', social_security_infos)
        except Exception:
            # Capture the traceback first so later failures in this handler
            # cannot replace it.
            error_trace = traceback.format_exc()
            error_path = logPath+'/'+task_id+".png"
            try:
                pyautogui.screenshot().save(error_path)
            except Exception:
                # The screenshot is best-effort and must not hide the real error.
                logger.exception('task_id:%s,failed to save error screenshot', task_id)
            try:
                logger.info('task_id:%s,html_msg:%s', task_id, ss_html)
                logger.error('spider-vnss-chrome,task_id:%s,爬取异常,截屏文件:%s,异常信息%s,html_msg:%s', task_id, error_path, error_trace, ss_html)
            except Exception:
                # Logging the page dump is best-effort as well.
                pass
            raise
        finally:
            # Close the browser tabs opened by this run.
            pyautogui.hotkey(command, 'w')
            pyautogui.hotkey(command, 'w')
        return social_security_infos
def trans_mp3_to_wav(filepath):
    """Decode the MP3 at ``filepath`` and write it as WAV to ``now.wav``."""
    AudioSegment.from_mp3(filepath).export("now.wav", format="wav")
def google_repcha2(task_id):
    """Answer the reCAPTCHA audio challenge that currently has focus.

    Opens the audio clip, copies its URL from the address bar, downloads the
    MP3, converts it to WAV, passes it to ``run_quickstart`` for speech
    recognition, and types the recognised text into the challenge.

    Args:
        task_id: string used to name the temporary ``.mp3`` / ``.wav`` files.

    Raises:
        Exception: when the copied URL does not look like an MP3 link.
        requests.HTTPError: when the audio download returns an error status.
    """
    # Move to the audio download/play control and open it.
    pyautogui.press('tab')
    pyautogui.press('tab')
    pyautogui.press('enter')
    sleep(5)
    # Focus the address bar and copy the clip URL, then close the audio tab.
    pyautogui.hotkey(command, 'l')
    pyautogui.hotkey(command, 'c')
    audo_url = pyperclip.paste()
    pyautogui.hotkey(command, 'w')
    logger.info(audo_url)

    if 'mp3' not in audo_url:
        # The message has to be interpolated here; passing task_id as a second
        # Exception argument leaves the '%s' placeholder unformatted.
        raise Exception('task_id:%s,打开语音链接失败' % task_id)

    mp3_filename = task_id+".mp3"
    wav_filename = task_id+".wav"
    try:
        response = requests.get(audo_url, timeout=30)
        # Fail on an HTTP error instead of trying to decode an error page.
        response.raise_for_status()
        with open(mp3_filename, "wb") as f:
            f.write(response.content)
        AudioSegment.from_mp3(mp3_filename).export(wav_filename, format="wav")
        code = run_quickstart(wav_filename)
    finally:
        # Remove the temporary files even when download, conversion or
        # recognition fails.
        for temp_path in (mp3_filename, wav_filename):
            if os.path.exists(temp_path):
                os.remove(temp_path)

    # NOTE(review): 'g','i' is presumably the Surfingkeys binding that focuses
    # the first input box — confirm.
    pyautogui.typewrite(['g', 'i'], interval=secs_between_keys)
    pyautogui.typewrite(code)
    # Tab from the answer box to the verify button and submit.
    for _ in range(5):
        pyautogui.press('tab')
    pyautogui.press('enter')
def check_picture(file_path):
    """Return True when the image at ``file_path`` is found on screen (colour match)."""
    return bool(pyautogui.locateOnScreen(file_path, grayscale=False))
def click_picture(file_path):
    """Click the centre of the on-screen match for the image at ``file_path``."""
    center_x, center_y = pyautogui.locateCenterOnScreen(file_path)
    pyautogui.click(x=center_x, y=center_y)
def check_str_bypyautogui(str):
    """Search the page for ``str`` by keyboard and report whether it was found.

    The text is pasted into a '/' search, the result is selected and copied,
    and the clipboard is then checked for ``str``.
    NOTE(review): '/', 'v' and shift+'$' are presumably Surfingkeys/vim-style
    bindings (find, visual mode, extend to end of line) — confirm.
    """
    pyperclip.copy(str)
    pyautogui.typewrite('/')
    pyautogui.hotkey(command, 'v')
    pyautogui.press('enter')
    pyautogui.typewrite('v')
    pyautogui.hotkey('shift', '$')
    # Overwrite the clipboard with a sentinel so a copy that does nothing
    # cannot be mistaken for a match on the search text itself.
    pyperclip.copy('this is a example')
    pyautogui.hotkey(command, 'c')
    pyautogui.press(['esc', 'esc'])
    return str in pyperclip.paste()
def check_str_bychrome(str):
    """Use the browser's find shortcut to test whether ``str`` is on the page.

    The text is pasted into the find box, the highlighted match is copied, and
    the copy is compared with ``str`` ignoring case and accents (via unidecode).
    """
    pyperclip.copy(str)
    pyautogui.hotkey(command, 'f')
    pyautogui.hotkey(command, 'v')
    pyautogui.press('enter')
    if command != 'ctrl':
        pyautogui.press('esc')
    else:
        # NOTE(review): three tabs plus enter presumably reach and press the
        # find bar's close button on this platform — confirm.
        for _ in range(3):
            pyautogui.press('tab')
        pyautogui.press('enter')
    # Overwrite the clipboard with a sentinel so a copy that does nothing
    # cannot be mistaken for a match on the search text itself.
    pyperclip.copy('this is a example')
    pyautogui.hotkey(command, 'c')
    copied_text = unidecode(pyperclip.paste().upper())
    wanted_text = unidecode(str.upper())
    return copied_text == wanted_text
def waitFor(string):
    """Poll the page until ``string`` is found or the attempts run out.

    Checks once immediately, then up to three more times with a 5 second
    pause before each re-check.

    Args:
        string: text to look for, passed to ``check_str_bychrome``.

    Returns:
        True as soon as any check finds the text, False after the last
        re-check fails.
    """
    if check_str_bychrome(string):
        return True
    for _ in range(3):
        sleep(5)
        # Return on the check result itself. Testing the attempt counter after
        # the check would discard a match found on the last allowed attempt.
        if check_str_bychrome(string):
            return True
    return False
if __name__ == "__main__":
    # Manual run with a sample task id and lookup payload.
    spider = VnSSChrome('vnsstest')
    sample_query = {'city': '79TTT', 'username': 'Ngô Mai Thanh Trà1x', 'idcard': '024744790'}
    print(spider.Chromestart(sample_query))
版权声明:本文为zhangpz19871210原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。