1
0
mirror of https://github.com/thedaviddelta/lingva-scraper.git synced 2025-10-05 15:52:40 +02:00

Implemented TTS scraping

This commit is contained in:
David
2022-06-01 20:00:28 +02:00
parent 3e93c09896
commit a93a8303d6
7 changed files with 289 additions and 1 deletions

13
src/audio.ts Normal file
View File

@@ -0,0 +1,13 @@
import { mapGoogleCode, LanguageType, LangCode } from "./utils/language";
import request, { Endpoint } from "./utils/request";
/**
 * Requests the text-to-speech audio for `text` in the given language.
 *
 * Only the first 200 characters are sent. Longer texts are cut at the last
 * space found at or before index 200 so that a word is not split, unless that
 * cut would leave nothing to pronounce.
 *
 * @param lang - Language code; it is mapped through `mapGoogleCode` as a target language.
 * @param text - Text to be spoken.
 * @returns Whatever `request(...).doing(...)` resolves to: the response bytes
 *          as an array of numbers, or `null` when no data was obtained.
 */
export const getAudio = (lang: LangCode, text: string) => {
    const parsedLang = mapGoogleCode(LanguageType.TARGET, lang);

    const maxLength = 200;
    const lastSpace = text.lastIndexOf(" ", maxLength);
    // A word-boundary cut is only usable when it leaves real content: a space at
    // index 0 (or only whitespace before it) would otherwise send an empty query.
    const wordBoundaryCut = lastSpace > 0 ? text.slice(0, lastSpace) : "";
    const slicedText = text.length > maxLength && wordBoundaryCut.trim() !== ""
        ? wordBoundaryCut
        : text.slice(0, maxLength);

    return request(Endpoint.AUDIO)
        .with({ lang: parsedLang, text: slicedText })
        .doing(({ data }) => data ? Array.from(new Uint8Array(data)) : null);
};

2
src/index.ts Normal file
View File

@@ -0,0 +1,2 @@
export * from "./audio";
export * from "./utils/language";

40
src/utils/language.ts Normal file
View File

@@ -0,0 +1,40 @@
import { languages, exceptions, mappings } from "./languages.json";
/**
 * Builds a type guard telling whether `key` is one of `obj`'s OWN properties.
 *
 * An own-property check is used instead of the `in` operator because `in`
 * also matches inherited members such as `toString` or `constructor`, which
 * would make any object (even an empty one) appear to contain those keys.
 *
 * @param obj - Object whose own keys are checked.
 * @returns A predicate narrowing `key` to `keyof T` when it is an own property of `obj`.
 */
const isKeyOf = <T extends object>(obj: T) => (key: keyof any): key is keyof T => (
    Object.prototype.hasOwnProperty.call(obj, key)
);
/**
 * The two roles a language can play; the values are the keys used to index
 * the per-role tables (`exceptions`, `mappings`) loaded from languages.json.
 */
export const LanguageType = {
SOURCE: "source",
TARGET: "target"
} as const;
/** Any language code listed under `languages` in languages.json. */
export type LangCode = keyof typeof languages;
/** Union of the `LanguageType` values: `"source" | "target"`. */
export type LangType = typeof LanguageType[keyof typeof LanguageType];
/**
 * Looks `langCode` up in the replacement table selected by `langType` and
 * returns the replacement when there is one, or the code itself otherwise.
 *
 * @param blacklists - Either the `mappings` or the `exceptions` table.
 * @param langType - Which side of the table to consult (source or target).
 * @param langCode - Code to check.
 */
const checkAndChangeCode = (
    blacklists: typeof mappings | typeof exceptions,
    langType: LangType,
    langCode: LangCode
): LangCode => {
    const replacements = blacklists[langType];

    if (isKeyOf(replacements)(langCode)) {
        return replacements[langCode];
    }

    // No entry for this code: it is used as-is.
    return langCode;
};
/**
 * Replaces `langCode` with its entry in the `mappings` table for the given
 * language type, or returns it unchanged when no mapping is defined.
 */
export const mapGoogleCode = (langType: LangType, langCode: LangCode) => {
    return checkAndChangeCode(mappings, langType, langCode);
};
/**
 * Replaces `langCode` with its entry in the `exceptions` table for the given
 * language type, or returns it unchanged when it is not an excepted code.
 */
export const replaceExceptedCode = (langType: LangType, langCode: LangCode) => {
    return checkAndChangeCode(exceptions, langType, langCode);
};
/**
 * Returns the language table without the codes that are listed as exceptions
 * for the given language type.
 *
 * @param type - Language type whose exception list is applied.
 */
const filteredLanguages = (type: LangType) => {
    const excludedCodes = Object.keys(exceptions[type]);

    const allowed = (Object.entries(languages) as [LangCode, string][])
        .filter(([code]) => !excludedCodes.includes(code));

    return Object.fromEntries(allowed) as typeof languages;
};
/**
 * Language tables exposed to consumers, built once when the module loads.
 */
export const languageList = {
// Every language from languages.json, with nothing removed.
all: languages,
// Languages minus the codes listed under `exceptions.source`.
source: filteredLanguages(LanguageType.SOURCE),
// Languages minus the codes listed under `exceptions.target`.
target: filteredLanguages(LanguageType.TARGET),
};

130
src/utils/languages.json Normal file
View File

@@ -0,0 +1,130 @@
{
"languages": {
"auto": "Detect",
"af": "Afrikaans",
"sq": "Albanian",
"am": "Amharic",
"ar": "Arabic",
"hy": "Armenian",
"az": "Azerbaijani",
"eu": "Basque",
"be": "Belarusian",
"bn": "Bengali",
"bs": "Bosnian",
"bg": "Bulgarian",
"ca": "Catalan",
"ceb": "Cebuano",
"ny": "Chichewa",
"zh": "Chinese",
"zh_HANT": "Chinese (Traditional)",
"co": "Corsican",
"hr": "Croatian",
"cs": "Czech",
"da": "Danish",
"nl": "Dutch",
"en": "English",
"eo": "Esperanto",
"et": "Estonian",
"tl": "Filipino",
"fi": "Finnish",
"fr": "French",
"fy": "Frisian",
"gl": "Galician",
"ka": "Georgian",
"de": "German",
"el": "Greek",
"gu": "Gujarati",
"ht": "Haitian Creole",
"ha": "Hausa",
"haw": "Hawaiian",
"iw": "Hebrew",
"hi": "Hindi",
"hmn": "Hmong",
"hu": "Hungarian",
"is": "Icelandic",
"ig": "Igbo",
"id": "Indonesian",
"ga": "Irish",
"it": "Italian",
"ja": "Japanese",
"jw": "Javanese",
"kn": "Kannada",
"kk": "Kazakh",
"km": "Khmer",
"rw": "Kinyarwanda",
"ko": "Korean",
"ku": "Kurdish (Kurmanji)",
"ky": "Kyrgyz",
"lo": "Lao",
"la": "Latin",
"lv": "Latvian",
"lt": "Lithuanian",
"lb": "Luxembourgish",
"mk": "Macedonian",
"mg": "Malagasy",
"ms": "Malay",
"ml": "Malayalam",
"mt": "Maltese",
"mi": "Maori",
"mr": "Marathi",
"mn": "Mongolian",
"my": "Myanmar (Burmese)",
"ne": "Nepali",
"no": "Norwegian",
"or": "Odia (Oriya)",
"ps": "Pashto",
"fa": "Persian",
"pl": "Polish",
"pt": "Portuguese",
"pa": "Punjabi",
"ro": "Romanian",
"ru": "Russian",
"sm": "Samoan",
"gd": "Scots Gaelic",
"sr": "Serbian",
"st": "Sesotho",
"sn": "Shona",
"sd": "Sindhi",
"si": "Sinhala",
"sk": "Slovak",
"sl": "Slovenian",
"so": "Somali",
"es": "Spanish",
"su": "Sundanese",
"sw": "Swahili",
"sv": "Swedish",
"tg": "Tajik",
"ta": "Tamil",
"tt": "Tatar",
"te": "Telugu",
"th": "Thai",
"tr": "Turkish",
"tk": "Turkmen",
"uk": "Ukrainian",
"ur": "Urdu",
"ug": "Uyghur",
"uz": "Uzbek",
"vi": "Vietnamese",
"cy": "Welsh",
"xh": "Xhosa",
"yi": "Yiddish",
"yo": "Yoruba",
"zu": "Zulu"
},
"exceptions": {
"source": {
"zh_HANT": "zh"
},
"target": {
"auto": "en"
}
},
"mappings": {
"source": {},
"target": {
"zh": "zh-CN",
"zh_HANT": "zh-TW",
"auto": "en"
}
}
}

78
src/utils/request.ts Normal file
View File

@@ -0,0 +1,78 @@
import axios, { AxiosResponse } from "axios";
import UserAgent from "user-agents";
import { LangCode } from "./language";
/**
 * Identifiers of the remote endpoints that `request` can target.
 */
export const Endpoint = {
TRANSLATION: "translation",
AUDIO: "audio"
} as const;
/** Union of the `Endpoint` values: `"translation" | "audio"`. */
type EndpointType = typeof Endpoint[keyof typeof Endpoint];
/**
 * Parameters accepted by each endpoint, keyed by the endpoint identifier.
 */
type Params = {
translation: {
// Payload sent as the POST body of the translation request.
body: string;
};
audio: {
// Language placed in the `tl` query parameter of the audio request.
lang: LangCode;
// Text placed (URL-encoded) in the `q` query parameter.
text: string;
};
};
/**
 * Small fluent wrapper around the two fetchers with automatic retrying.
 *
 * Usage: `request(endpoint).with(params).doing(callback)`.
 *
 * - `with` fires the HTTP call immediately and exposes the raw `promise`.
 * - `doing` runs `callback` on the response. A rejected request or a throwing
 *   callback is treated as an empty result. While the result is empty (see
 *   `isEmpty`) and fewer than 3 retries have been made, the whole request is
 *   issued again; otherwise the result is returned, with `undefined`
 *   normalised to `null`.
 *
 * @param endpoint - Which endpoint to call.
 * @param retry - Number of retries already performed (internal, defaults to 0).
 */
const request = <T extends EndpointType>(
    endpoint: T,
    retry: number = 0
) => ({
    with: (
        params: Params[T]
    ) => {
        // The request starts here, not when `doing` is invoked.
        const promise = endpoint === "translation"
            ? fetchTranslation(params as Params["translation"])
            : fetchAudio(params as Params["audio"]);

        return {
            promise,
            doing: <V>(
                callback: (res: AxiosResponse) => V | undefined
            ): Promise<V | null> => {
                // Any failure is collapsed into `undefined` so it follows the retry path.
                const attempt = promise
                    .then(callback)
                    .catch(() => undefined);

                return attempt.then(result => {
                    const shouldRetry = isEmpty(result) && retry < 3;
                    return shouldRetry
                        ? request(endpoint, retry + 1).with(params).doing(callback)
                        : result ?? null;
                });
            }
        };
    }
});
/**
 * Tells whether a value counts as "no result".
 *
 * Every falsy value is empty (so `0` and `""` are too), and so is any object
 * exposing a `length` that is zero or negative. Everything else is non-empty.
 */
const isEmpty = (item: any) => {
    if (!item) {
        return true;
    }
    if (typeof item !== "object") {
        return false;
    }
    return "length" in item && item.length <= 0;
};
/**
 * POSTs `body` as form-urlencoded data to the Google Translate batchexecute
 * endpoint, using a freshly generated User-Agent string on every call.
 *
 * @returns The pending axios response.
 */
const fetchTranslation = ({ body }: Params["translation"]) => {
    const url = "https://translate.google.com/_/TranslateWebserverUi/data/batchexecute?rpcids=MkEWBc&rt=c";
    const headers = {
        "User-Agent": new UserAgent().toString(),
        "Content-Type": "application/x-www-form-urlencoded"
    };

    return axios.post(url, body, { headers });
};
/**
 * GETs the Google Translate TTS endpoint for `text` in language `lang`,
 * asking axios for the raw bytes (`arraybuffer`) and sending a freshly
 * generated User-Agent string on every call.
 *
 * @returns The pending axios response.
 */
const fetchAudio = ({ lang, text }: Params["audio"]) => {
    const url = `https://translate.google.com/translate_tts?tl=${lang}&q=${encodeURIComponent(text)}&textlen=${text.length}&client=tw-ob`;

    return axios.get(url, {
        responseType: "arraybuffer",
        headers: {
            "User-Agent": new UserAgent().toString()
        }
    });
};
export default request;

24
tests/audio.test.ts Normal file
View File

@@ -0,0 +1,24 @@
import { getAudio, LangCode } from "../src";
// [language, text] pairs for which getAudio is expected to return audio.
// "auto", "zh" and "zh_HANT" are the codes that have an entry under
// `mappings.target` in languages.json.
const queries: [LangCode, string][] = [
["es", "hola"],
["ca", "gerd"],
["en", "impression"],
["auto", "impression"],
["zh", "早安"],
["zh_HANT", "早安"]
];
// Every query must yield a non-null result made only of numbers.
it("returns audio buffer correctly", async () => {
    const results = await Promise.all(queries.map((query) => getAudio(...query)));

    for (const audio of results) {
        expect(audio).not.toBeNull();
        audio?.forEach(int => expect(int).toEqual(expect.any(Number)));
    }
});
// An unknown language code must resolve to null rather than reject.
it("returns null on wrong language", async () => {
    // @ts-expect-error -- "wrong" is deliberately not a valid LangCode
    const audio = await getAudio("wrong", "impression");

    expect(audio).toBeNull();
});

View File

@@ -6,7 +6,8 @@
"rootDir": "./src",
"outDir": "./dist",
"esModuleInterop": true,
"strict": true
"strict": true,
"resolveJsonModule": true
},
"include": [
"src/**/*.ts"