1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
import base64
import os
import re
import subprocess
import time
import wave

import requests
from bs4 import BeautifulSoup
# Baidu OAuth 2.0 token endpoint. The two %s slots receive the API key and
# the secret key, in that order.
base_url = "https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=%s&client_secret=%s"

# Credentials are taken from the environment so real keys never have to be
# written into this file; the "***" placeholders remain the fallback values.
APIKey = os.environ.get("BAIDU_API_KEY", "***")
SecretKey = os.environ.get("BAIDU_SECRET_KEY", "***")

# Token-request URL with both credentials substituted in.
HOST = base_url % (APIKey, SecretKey)
def getToken(host):
    """Request an OAuth access token from the given token URL.

    Args:
        host: Complete token-request URL, credentials already included in
            its query string.

    Returns:
        The value of the ``access_token`` field of the JSON response.

    Raises:
        requests.RequestException: On a network failure or timeout.
        KeyError: If the JSON response carries no ``access_token``; the
            message includes the decoded response.
    """
    # Without a timeout a stalled connection would block the script forever.
    res = requests.post(host, timeout=30)
    payload = res.json()
    if 'access_token' not in payload:
        # Same exception type the bare subscript used to raise, but with the
        # server's answer attached so the failure can be diagnosed.
        raise KeyError(
            'access_token missing from token response: %r' % (payload,)
        )
    return payload['access_token']
def get_audio(file):
    """Return the complete binary content of the file at path ``file``."""
    with open(file, 'rb') as audio_fh:
        return audio_fh.read()
def speech2text(FILEPATH, dev_pid=1737):
    """Upload an audio file to the Baidu speech REST endpoint for recognition.

    A fresh token is requested through ``getToken(HOST)`` on every call, and
    the audio bytes are sent base64-encoded inside a JSON body.

    Args:
        FILEPATH: Path of the audio file to upload. Its extension selects
            the ``format`` field (pcm, wav, amr or m4a); any other extension
            falls back to ``'wav'``.
        dev_pid: Value forwarded unchanged as the request's ``dev_pid``.

    Returns:
        The first entry of the response's ``result`` list when that list is
        present and non-empty; otherwise the whole decoded JSON response, so
        the caller can inspect the error fields.

    Raises:
        requests.RequestException: On a network failure or timeout.
    """
    token = getToken(HOST)
    speech_data = get_audio(FILEPATH)

    # Label the upload by what the file actually is: a headerless .pcm file
    # must not be announced as 'wav'.
    extension = os.path.splitext(str(FILEPATH))[1].lstrip('.').lower()
    if extension in ('pcm', 'wav', 'amr', 'm4a'):
        audio_format = extension
    else:
        audio_format = 'wav'

    data = {
        'format': audio_format,
        # Sent as an integer, the type the API documents for this field.
        'rate': 16000,
        'channel': 1,
        'cuid': '*******',
        # Length of the raw audio in bytes, i.e. before base64 encoding.
        'len': len(speech_data),
        'speech': base64.b64encode(speech_data).decode('utf-8'),
        'token': token,
        'dev_pid': dev_pid,
    }
    url = 'https://vop.baidu.com/server_api'
    headers = {'Content-Type': 'application/json'}
    # The timeout keeps a stalled upload from hanging the caller.
    r = requests.post(url, json=data, headers=headers, timeout=60)
    Result = r.json()

    # An empty 'result' list is treated like a missing one instead of
    # raising IndexError.
    if 'result' in Result and Result['result']:
        return Result['result'][0]
    return Result
def _extract_mp3_urls(raw_list):
    """Return the non-question ``.mp3`` entries of a bracketed, comma-separated list string."""
    urls = []
    # Drop the first and last character (the surrounding brackets).
    for entry in str(raw_list)[1:-1].split(','):
        # Tolerate whitespace after the commas before removing the quotes.
        candidate = entry.strip().strip('\'"')
        if not re.match(r'.*\.mp3.*', candidate):
            continue
        if re.match(r'.*question\.mp3+.*', candidate):
            continue
        urls.append(candidate)
    return urls


def _transcode_to_pcm(mp3_path, pcm_path):
    """Convert ``mp3_path`` to raw 16 kHz mono s16le PCM with ffmpeg; return True on success."""
    # An argument list avoids the shell entirely, and the exit status is
    # reported to the caller instead of being discarded.
    completed = subprocess.run([
        'ffmpeg', '-y', '-i', mp3_path,
        '-acodec', 'pcm_s16le', '-f', 's16le', '-ac', '1', '-ar', '16000',
        pcm_path,
    ])
    return completed.returncode == 0


def main():
    """Download, transcode and transcribe every mp3 listed in ``index.html``.

    Reads ``index.html`` from the current directory and collects the mp3
    addresses found in the ``span`` of each ``div.itest-hear-reslist``
    (entries matching ``question.mp3`` are skipped). Each address is saved
    as ``<n>.mp3``, converted to ``<n>.pcm`` with ffmpeg and sent to
    ``speech2text``; the result is printed and written to ``<n>.txt``.
    ``n`` starts at 1 and advances once per mp3 address, including ones
    whose download or conversion fails.

    Raises:
        FileNotFoundError: If ``index.html`` or the ``ffmpeg`` executable
            cannot be found.
    """
    url = "index.html"
    # Close the page file deterministically, and name the parser so the
    # parse does not depend on which optional parsers are installed.
    with open(url, encoding='utf-8') as page:
        soup = BeautifulSoup(page, 'html.parser')
    allres = soup.find_all(name='div', attrs={"class": "itest-hear-reslist"})

    counter = 1
    for res in allres:
        if res.span is None:
            # Nothing to read the address list from in this block.
            continue
        for download_addr in _extract_mp3_urls(res.span.text):
            # Reserve this item's number up front so a failure below still
            # consumes it and later files keep their numbering.
            stem = str(counter)
            counter += 1

            print(download_addr)
            print('Downloading...')
            try:
                f = requests.get(download_addr, timeout=60)
                f.raise_for_status()
            except requests.RequestException as err:
                # Skip this item rather than saving an error page as audio.
                print('Download failed: %s' % err)
                continue
            with open(stem + ".mp3", "wb") as code:
                code.write(f.content)
            print('Downloaded')

            print('Transcoding mp3 to pcm...')
            if not _transcode_to_pcm(stem + ".mp3", stem + ".pcm"):
                print('Transcoding failed')
                continue
            print('Transcoded mp3 to pcm')

            print('Recognizing...')
            result = speech2text(FILEPATH=stem + ".pcm")
            print('-' * 48)
            print(result)
            # An explicit encoding keeps the transcript independent of the
            # platform's default.
            with open(stem + ".txt", "w", encoding='utf-8') as ff:
                print(result, file=ff)
            print('-' * 48)
            print('Recognized')
    print('Finished')
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()