mirror of
https://github.com/thedaviddelta/lingva-scraper.git
synced 2025-10-05 15:52:40 +02:00
Implemented TTS scraping
This commit is contained in:
13
src/audio.ts
Normal file
13
src/audio.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
import { mapGoogleCode, LanguageType, LangCode } from "./utils/language";
|
||||
import request, { Endpoint } from "./utils/request";
|
||||
|
||||
/**
 * Fetches the text-to-speech audio for `text` spoken in `lang`.
 *
 * The language code is first mapped through `mapGoogleCode` for the target
 * side. The text is cut to at most 200 UTF-16 code units, preferably at the
 * last space inside that window so a word is not split in half.
 *
 * @param lang - Language code to request the audio in.
 * @param text - Text to be spoken; anything past the cut is dropped.
 * @returns The response bytes as an array of numbers, or `null` when the
 * request yields no data.
 */
export const getAudio = (lang: LangCode, text: string): Promise<number[] | null> => {
    const maxLength = 200;
    const parsedLang = mapGoogleCode(LanguageType.TARGET, lang);

    let end = text.length;
    if (text.length > maxLength) {
        const lastSpace = text.lastIndexOf(" ", maxLength);
        // A space at index 0 (or no space at all) must not be used as the cut
        // point: cutting there would send an empty text.
        end = lastSpace > 0 ? lastSpace : maxLength;

        // Never end on a dangling high surrogate: the text is later passed to
        // encodeURIComponent, which throws on lone surrogates.
        const lastUnit = text.charCodeAt(end - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            end -= 1;
        }
    }
    const slicedText = text.slice(0, end);

    return request(Endpoint.AUDIO)
        .with({ lang: parsedLang, text: slicedText })
        .doing(({ data }) => data ? Array.from(new Uint8Array(data)) : null);
};
|
2
src/index.ts
Normal file
2
src/index.ts
Normal file
@@ -0,0 +1,2 @@
|
||||
export * from "./audio";
|
||||
export * from "./utils/language";
|
40
src/utils/language.ts
Normal file
40
src/utils/language.ts
Normal file
@@ -0,0 +1,40 @@
|
||||
import { languages, exceptions, mappings } from "./languages.json";
|
||||
|
||||
/**
 * Builds a type guard telling whether `key` is one of `obj`'s own property keys.
 *
 * An own-property check is used instead of the `in` operator so that names
 * inherited from `Object.prototype` (e.g. "toString", "constructor") are not
 * reported as keys of `obj`.
 */
const isKeyOf = <T extends object>(obj: T) => (key: PropertyKey): key is keyof T =>
    Object.prototype.hasOwnProperty.call(obj, key);
|
||||
|
||||
/**
 * The two sides a language code can be used on.
 * The values double as the keys of the `exceptions` and `mappings` tables.
 */
export const LanguageType = {
    SOURCE: "source",
    TARGET: "target",
} as const;
|
||||
|
||||
/** Any key of the `languages` table. */
export type LangCode = keyof typeof languages;

/** Union of the `LanguageType` values: `"source" | "target"`. */
export type LangType = (typeof LanguageType)[keyof typeof LanguageType];
|
||||
|
||||
/**
 * Looks `langCode` up in the replacement table for the given side and returns
 * the replacement when one exists, otherwise the code unchanged.
 *
 * @param blacklists - Either the `mappings` or the `exceptions` table.
 * @param langType - Which side of the table to consult ("source" or "target").
 * @param langCode - The code to check.
 */
const checkAndChangeCode = (
    blacklists: typeof mappings | typeof exceptions,
    langType: LangType,
    langCode: LangCode
): LangCode => {
    const replacements = blacklists[langType];
    if (isKeyOf(replacements)(langCode)) {
        return replacements[langCode];
    }
    return langCode;
};
|
||||
/**
 * Replaces `langCode` with its entry in the `mappings` table for the given
 * side, if there is one; otherwise returns it unchanged.
 */
export const mapGoogleCode = (langType: LangType, langCode: LangCode) => {
    return checkAndChangeCode(mappings, langType, langCode);
};
|
||||
/**
 * Replaces `langCode` with its entry in the `exceptions` table for the given
 * side, if there is one; otherwise returns it unchanged.
 */
export const replaceExceptedCode = (langType: LangType, langCode: LangCode) => {
    return checkAndChangeCode(exceptions, langType, langCode);
};
|
||||
|
||||
/**
 * Returns the entries of `languages` whose code is not listed under
 * `exceptions[type]`.
 *
 * @param type - Side ("source" or "target") whose exceptions are removed.
 */
const filteredLanguages = (type: LangType) => {
    // Built once up front instead of re-listing the keys for every entry.
    const excluded = new Set<string>(Object.keys(exceptions[type]));

    const kept = (Object.entries(languages) as [LangCode, string][])
        .filter(([code]) => !excluded.has(code));

    // NOTE(review): the cast keeps the full `languages` shape even though the
    // excluded codes are no longer present in the returned object.
    return Object.fromEntries(kept) as typeof languages;
};
|
||||
|
||||
/**
 * Language tables keyed by code:
 * - `all`: the complete `languages` table;
 * - `source` / `target`: the same table without the codes listed as
 *   exceptions for that side.
 */
export const languageList = {
    all: languages,
    source: filteredLanguages(LanguageType.SOURCE),
    target: filteredLanguages(LanguageType.TARGET)
};
|
130
src/utils/languages.json
Normal file
130
src/utils/languages.json
Normal file
@@ -0,0 +1,130 @@
|
||||
{
|
||||
"languages": {
|
||||
"auto": "Detect",
|
||||
"af": "Afrikaans",
|
||||
"sq": "Albanian",
|
||||
"am": "Amharic",
|
||||
"ar": "Arabic",
|
||||
"hy": "Armenian",
|
||||
"az": "Azerbaijani",
|
||||
"eu": "Basque",
|
||||
"be": "Belarusian",
|
||||
"bn": "Bengali",
|
||||
"bs": "Bosnian",
|
||||
"bg": "Bulgarian",
|
||||
"ca": "Catalan",
|
||||
"ceb": "Cebuano",
|
||||
"ny": "Chichewa",
|
||||
"zh": "Chinese",
|
||||
"zh_HANT": "Chinese (Traditional)",
|
||||
"co": "Corsican",
|
||||
"hr": "Croatian",
|
||||
"cs": "Czech",
|
||||
"da": "Danish",
|
||||
"nl": "Dutch",
|
||||
"en": "English",
|
||||
"eo": "Esperanto",
|
||||
"et": "Estonian",
|
||||
"tl": "Filipino",
|
||||
"fi": "Finnish",
|
||||
"fr": "French",
|
||||
"fy": "Frisian",
|
||||
"gl": "Galician",
|
||||
"ka": "Georgian",
|
||||
"de": "German",
|
||||
"el": "Greek",
|
||||
"gu": "Gujarati",
|
||||
"ht": "Haitian Creole",
|
||||
"ha": "Hausa",
|
||||
"haw": "Hawaiian",
|
||||
"iw": "Hebrew",
|
||||
"hi": "Hindi",
|
||||
"hmn": "Hmong",
|
||||
"hu": "Hungarian",
|
||||
"is": "Icelandic",
|
||||
"ig": "Igbo",
|
||||
"id": "Indonesian",
|
||||
"ga": "Irish",
|
||||
"it": "Italian",
|
||||
"ja": "Japanese",
|
||||
"jw": "Javanese",
|
||||
"kn": "Kannada",
|
||||
"kk": "Kazakh",
|
||||
"km": "Khmer",
|
||||
"rw": "Kinyarwanda",
|
||||
"ko": "Korean",
|
||||
"ku": "Kurdish (Kurmanji)",
|
||||
"ky": "Kyrgyz",
|
||||
"lo": "Lao",
|
||||
"la": "Latin",
|
||||
"lv": "Latvian",
|
||||
"lt": "Lithuanian",
|
||||
"lb": "Luxembourgish",
|
||||
"mk": "Macedonian",
|
||||
"mg": "Malagasy",
|
||||
"ms": "Malay",
|
||||
"ml": "Malayalam",
|
||||
"mt": "Maltese",
|
||||
"mi": "Maori",
|
||||
"mr": "Marathi",
|
||||
"mn": "Mongolian",
|
||||
"my": "Myanmar (Burmese)",
|
||||
"ne": "Nepali",
|
||||
"no": "Norwegian",
|
||||
"or": "Odia (Oriya)",
|
||||
"ps": "Pashto",
|
||||
"fa": "Persian",
|
||||
"pl": "Polish",
|
||||
"pt": "Portuguese",
|
||||
"pa": "Punjabi",
|
||||
"ro": "Romanian",
|
||||
"ru": "Russian",
|
||||
"sm": "Samoan",
|
||||
"gd": "Scots Gaelic",
|
||||
"sr": "Serbian",
|
||||
"st": "Sesotho",
|
||||
"sn": "Shona",
|
||||
"sd": "Sindhi",
|
||||
"si": "Sinhala",
|
||||
"sk": "Slovak",
|
||||
"sl": "Slovenian",
|
||||
"so": "Somali",
|
||||
"es": "Spanish",
|
||||
"su": "Sundanese",
|
||||
"sw": "Swahili",
|
||||
"sv": "Swedish",
|
||||
"tg": "Tajik",
|
||||
"ta": "Tamil",
|
||||
"tt": "Tatar",
|
||||
"te": "Telugu",
|
||||
"th": "Thai",
|
||||
"tr": "Turkish",
|
||||
"tk": "Turkmen",
|
||||
"uk": "Ukrainian",
|
||||
"ur": "Urdu",
|
||||
"ug": "Uyghur",
|
||||
"uz": "Uzbek",
|
||||
"vi": "Vietnamese",
|
||||
"cy": "Welsh",
|
||||
"xh": "Xhosa",
|
||||
"yi": "Yiddish",
|
||||
"yo": "Yoruba",
|
||||
"zu": "Zulu"
|
||||
},
|
||||
"exceptions": {
|
||||
"source": {
|
||||
"zh_HANT": "zh"
|
||||
},
|
||||
"target": {
|
||||
"auto": "en"
|
||||
}
|
||||
},
|
||||
"mappings": {
|
||||
"source": {},
|
||||
"target": {
|
||||
"zh": "zh-CN",
|
||||
"zh_HANT": "zh-TW",
|
||||
"auto": "en"
|
||||
}
|
||||
}
|
||||
}
|
78
src/utils/request.ts
Normal file
78
src/utils/request.ts
Normal file
@@ -0,0 +1,78 @@
|
||||
import axios, { AxiosResponse } from "axios";
|
||||
import UserAgent from "user-agents";
|
||||
import { LangCode } from "./language";
|
||||
|
||||
/** Identifiers of the remote endpoints that `request` can target. */
export const Endpoint = {
    TRANSLATION: "translation",
    AUDIO: "audio",
} as const;

/** Union of the `Endpoint` values: `"translation" | "audio"`. */
type EndpointType = (typeof Endpoint)[keyof typeof Endpoint];
|
||||
|
||||
/** Parameters accepted by each endpoint, keyed by the endpoint identifier. */
type Params = {
    /** Translation request: `body` is sent as the POST body. */
    translation: {
        body: string;
    };
    /** Audio request: `lang` and `text` are sent as query parameters. */
    audio: {
        lang: LangCode;
        text: string;
    };
};
|
||||
|
||||
/**
 * Small fluent helper around the two fetchers:
 * `request(endpoint).with(params).doing(callback)`.
 *
 * - `with` starts the HTTP call right away and exposes it as `promise`.
 * - `doing` runs `callback` on the response. A rejected request or a throwing
 *   callback is treated like an empty result. While the result is empty and
 *   fewer than 3 retries have been made, the whole request is issued again.
 *   Once retries are exhausted the last result is returned as is, with
 *   `undefined` turned into `null`.
 *
 * @param endpoint - Which endpoint to call.
 * @param retry - Number of retries already performed (used by the recursion).
 */
const request = <T extends EndpointType>(endpoint: T, retry = 0) => {
    const withParams = (params: Params[T]) => {
        const promise = endpoint === "translation"
            ? fetchTranslation(params as Params["translation"])
            : fetchAudio(params as Params["audio"]);

        const doing = <V>(
            callback: (res: AxiosResponse) => V | undefined
        ): Promise<V | null> => {
            const retryOrSettle = (result: V | undefined): V | null | Promise<V | null> => {
                if (isEmpty(result) && retry < 3) {
                    return request(endpoint, retry + 1).with(params).doing(callback);
                }
                return result ?? null;
            };

            return promise
                .then(callback)
                // Failures are deliberately folded into "no result" so they
                // follow the same retry path as empty responses.
                .catch(() => undefined)
                .then(retryOrSettle);
        };

        return { promise, doing };
    };

    return { with: withParams };
};
|
||||
|
||||
/**
 * Tells whether a callback result counts as "nothing usable": any falsy value,
 * or an object exposing a `length` that is zero or less (e.g. an empty array).
 */
const isEmpty = (item: unknown): boolean => {
    if (!item) {
        return true;
    }
    if (typeof item !== "object" || item === null) {
        return false;
    }
    return "length" in item && (item as { length: number }).length <= 0;
};
|
||||
|
||||
/**
 * POSTs `body` as form-urlencoded data to the translate.google.com
 * `batchexecute` endpoint, using a User-Agent string generated by the
 * `user-agents` package.
 */
const fetchTranslation = ({ body }: Params["translation"]) => {
    const url = "https://translate.google.com/_/TranslateWebserverUi/data/batchexecute?rpcids=MkEWBc&rt=c";
    const headers = {
        "User-Agent": new UserAgent().toString(),
        "Content-Type": "application/x-www-form-urlencoded"
    };

    return axios.post(url, body, { headers });
};
|
||||
|
||||
/**
 * GETs the translate.google.com `translate_tts` endpoint for `text` in `lang`
 * and asks axios for the response as an array buffer, using a User-Agent
 * string generated by the `user-agents` package.
 *
 * Both `lang` and `text` are percent-encoded so that neither can break out of
 * its query parameter.
 */
const fetchAudio = ({ lang, text }: Params["audio"]) => {
    const query = [
        `tl=${encodeURIComponent(lang)}`,
        `q=${encodeURIComponent(text)}`,
        `textlen=${text.length}`,
        "client=tw-ob"
    ].join("&");

    return axios.get(`https://translate.google.com/translate_tts?${query}`, {
        responseType: "arraybuffer",
        headers: {
            "User-Agent": new UserAgent().toString()
        }
    });
};
|
||||
|
||||
export default request;
|
24
tests/audio.test.ts
Normal file
24
tests/audio.test.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
import { getAudio, LangCode } from "../src";
|
||||
|
||||
// [language code, text] pairs for which an audio buffer is expected.
const queries: Array<[LangCode, string]> = [
    ["es", "hola"],
    ["ca", "gerd"],
    ["en", "impression"],
    ["auto", "impression"],
    ["zh", "早安"],
    ["zh_HANT", "早安"],
];
|
||||
|
||||
// Every query must resolve to a non-null array made only of numbers.
it("returns audio buffer correctly", async () => {
    const results = await Promise.all(
        queries.map(([lang, text]) => getAudio(lang, text))
    );

    for (const audio of results) {
        expect(audio).not.toBeNull();
        audio?.forEach(int => expect(int).toEqual(expect.any(Number)));
    }
});
|
||||
|
||||
// An unknown language code must resolve to null instead of throwing.
it("returns null on wrong language", async () => {
    // @ts-expect-error -- "wrong" is deliberately not a valid LangCode
    const audio = await getAudio("wrong", "impression");
    expect(audio).toBeNull();
});
|
@@ -6,7 +6,8 @@
|
||||
"rootDir": "./src",
|
||||
"outDir": "./dist",
|
||||
"esModuleInterop": true,
|
||||
"strict": true
|
||||
"strict": true,
|
||||
"resolveJsonModule": true
|
||||
},
|
||||
"include": [
|
||||
"src/**/*.ts"
|
||||
|
Reference in New Issue
Block a user