From a93a8303d64b7b5e5de3a961ad195c87fe5d364b Mon Sep 17 00:00:00 2001
From: David
Date: Wed, 1 Jun 2022 20:00:28 +0200
Subject: [PATCH] Implemented TTS scraping

---
Review notes (text between the "---" line and the diffstat is commentary):

 * Line breaks and indentation had been lost in this copy of the patch. It is
   laid out again here with 4-space indentation; the tsconfig.json context
   lines may need `git apply --ignore-whitespace` if the real file is
   indented differently.
 * Type parameters that had been stripped are restored: `isKeyOf`, `request`,
   `doing` and the `Promise<V | null>` return type.
 * getAudio no longer cuts the text at a leading space (which sent an empty
   text); fetchAudio URL-encodes `tl` like it already encodes `q`;
   filteredLanguages reads the exception codes once; the test uses
   `@ts-expect-error` instead of `@ts-ignore`; doc comments added.
 * The `index` blob ids are those of the original commit and were not
   recomputed for the edited files.

 src/audio.ts             |  30 +++++++++
 src/index.ts             |   2 +
 src/utils/language.ts    |  53 ++++++++++++++++
 src/utils/languages.json | 130 +++++++++++++++++++++++++++++++++++++++
 src/utils/request.ts     |  95 +++++++++++++++++++++++++++++
 tests/audio.test.ts      |  24 ++++++++
 tsconfig.json            |   3 +-
 7 files changed, 336 insertions(+), 1 deletion(-)
 create mode 100644 src/audio.ts
 create mode 100644 src/index.ts
 create mode 100644 src/utils/language.ts
 create mode 100644 src/utils/languages.json
 create mode 100644 src/utils/request.ts
 create mode 100644 tests/audio.test.ts

diff --git a/src/audio.ts b/src/audio.ts
new file mode 100644
index 0000000..d79e548
--- /dev/null
+++ b/src/audio.ts
@@ -0,0 +1,30 @@
+import { mapGoogleCode, LanguageType, LangCode } from "./utils/language";
+import request, { Endpoint } from "./utils/request";
+
+// Longest text, in characters, that is sent in a single audio request.
+const MAX_TEXT_LENGTH = 200;
+
+/**
+ * Fetches the spoken version of `text` from the Google Translate TTS
+ * endpoint.
+ *
+ * Only the first MAX_TEXT_LENGTH characters are sent; longer text is
+ * cut at the last space at or before that index when there is one.
+ *
+ * @param lang - Language code to speak in; mapped with `mapGoogleCode`.
+ * @param text - Text to be spoken.
+ * @returns The audio bytes as an array of numbers, or `null` when no
+ * audio could be fetched.
+ */
+export const getAudio = (lang: LangCode, text: string) => {
+    const parsedLang = mapGoogleCode(LanguageType.TARGET, lang);
+
+    const lastSpace = text.lastIndexOf(" ", MAX_TEXT_LENGTH);
+    // A space at index 0 is not a usable cut point: it would leave an empty text.
+    const cutAtSpace = text.length > MAX_TEXT_LENGTH && lastSpace > 0;
+    const slicedText = text.slice(0, cutAtSpace ? lastSpace : MAX_TEXT_LENGTH);
+
+    return request(Endpoint.AUDIO)
+        .with({ lang: parsedLang, text: slicedText })
+        .doing(({ data }) => data ? Array.from(new Uint8Array(data)) : null);
+};
diff --git a/src/index.ts b/src/index.ts
new file mode 100644
index 0000000..73c03e9
--- /dev/null
+++ b/src/index.ts
@@ -0,0 +1,2 @@
+export * from "./audio";
+export * from "./utils/language";
diff --git a/src/utils/language.ts b/src/utils/language.ts
new file mode 100644
index 0000000..eb13e66
--- /dev/null
+++ b/src/utils/language.ts
@@ -0,0 +1,53 @@
+import { languages, exceptions, mappings } from "./languages.json";
+
+// Builds a type guard telling whether `key` is a key of `obj` (checked with `in`).
+const isKeyOf = <T extends object>(obj: T) => (key: keyof any): key is keyof T => key in obj;
+
+// The two roles a language code can be used in.
+export const LanguageType = {
+    SOURCE: "source",
+    TARGET: "target"
+} as const;
+
+export type LangCode = keyof typeof languages;
+export type LangType = typeof LanguageType[keyof typeof LanguageType];
+
+/**
+ * Looks `langCode` up in the `langType` section of `blacklists` and returns
+ * its replacement, or `langCode` itself when it is not listed there.
+ */
+const checkAndChangeCode = (
+    blacklists: typeof mappings | typeof exceptions,
+    langType: LangType,
+    langCode: LangCode
+): LangCode => {
+    const finalBlacklist = blacklists[langType];
+    return isKeyOf(finalBlacklist)(langCode)
+        ? finalBlacklist[langCode]
+        : langCode;
+};
+
+/** Replaces `langCode` with its entry in the `mappings` table of languages.json, if any. */
+export const mapGoogleCode = (langType: LangType, langCode: LangCode) => checkAndChangeCode(mappings, langType, langCode);
+/** Replaces `langCode` with its entry in the `exceptions` table of languages.json, if any. */
+export const replaceExceptedCode = (langType: LangType, langCode: LangCode) => checkAndChangeCode(exceptions, langType, langCode);
+
+/** Returns the languages whose code is not listed in `exceptions` for `type`. */
+const filteredLanguages = (type: LangType) => {
+    const entries = Object.entries(languages) as [LangCode, string][];
+    // Read once here instead of once per entry inside the filter.
+    const exceptedCodes = Object.keys(exceptions[type]);
+
+    const filteredEntries = entries.filter(([code]) => (
+        !exceptedCodes.includes(code)
+    ));
+
+    return Object.fromEntries(filteredEntries) as typeof languages;
+};
+
+// Every language, plus the lists with the source / target exceptions removed.
+export const languageList = {
+    all: languages,
+    source: filteredLanguages(LanguageType.SOURCE),
+    target: filteredLanguages(LanguageType.TARGET),
+};
diff --git a/src/utils/languages.json b/src/utils/languages.json
new file mode 100644
index 0000000..b883aa5
--- /dev/null
+++ b/src/utils/languages.json
@@ -0,0 +1,130 @@
+{
+    "languages": {
+        "auto": "Detect",
+        "af": "Afrikaans",
+        "sq": "Albanian",
+        "am": "Amharic",
+        "ar": "Arabic",
+        "hy": "Armenian",
+        "az": "Azerbaijani",
+        "eu": "Basque",
+        "be": "Belarusian",
+        "bn": "Bengali",
+        "bs": "Bosnian",
+        "bg": "Bulgarian",
+        "ca": "Catalan",
+        "ceb": "Cebuano",
+        "ny": "Chichewa",
+        "zh": "Chinese",
+        "zh_HANT": "Chinese (Traditional)",
+        "co": "Corsican",
+        "hr": "Croatian",
+        "cs": "Czech",
+        "da": "Danish",
+        "nl": "Dutch",
+        "en": "English",
+        "eo": "Esperanto",
+        "et": "Estonian",
+        "tl": "Filipino",
+        "fi": "Finnish",
+        "fr": "French",
+        "fy": "Frisian",
+        "gl": "Galician",
+        "ka": "Georgian",
+        "de": "German",
+        "el": "Greek",
+        "gu": "Gujarati",
+        "ht": "Haitian Creole",
+        "ha": "Hausa",
+        "haw": "Hawaiian",
+        "iw": "Hebrew",
+        "hi": "Hindi",
+        "hmn": "Hmong",
+        "hu": "Hungarian",
+        "is": "Icelandic",
+        "ig": "Igbo",
+        "id": "Indonesian",
+        "ga": "Irish",
+        "it": "Italian",
+        "ja": "Japanese",
+        "jw": "Javanese",
+        "kn": "Kannada",
+        "kk": "Kazakh",
+        "km": "Khmer",
+        "rw": "Kinyarwanda",
+        "ko": "Korean",
+        "ku": "Kurdish (Kurmanji)",
+        "ky": "Kyrgyz",
+        "lo": "Lao",
+        "la": "Latin",
+        "lv": "Latvian",
+        "lt": "Lithuanian",
+        "lb": "Luxembourgish",
+        "mk": "Macedonian",
+        "mg": "Malagasy",
+        "ms": "Malay",
+        "ml": "Malayalam",
+        "mt": "Maltese",
+        "mi": "Maori",
+        "mr": "Marathi",
+        "mn": "Mongolian",
+        "my": "Myanmar (Burmese)",
+        "ne": "Nepali",
+        "no": "Norwegian",
+        "or": "Odia (Oriya)",
+        "ps": "Pashto",
+        "fa": "Persian",
+        "pl": "Polish",
+        "pt": "Portuguese",
+        "pa": "Punjabi",
+        "ro": "Romanian",
+        "ru": "Russian",
+        "sm": "Samoan",
+        "gd": "Scots Gaelic",
+        "sr": "Serbian",
+        "st": "Sesotho",
+        "sn": "Shona",
+        "sd": "Sindhi",
+        "si": "Sinhala",
+        "sk": "Slovak",
+        "sl": "Slovenian",
+        "so": "Somali",
+        "es": "Spanish",
+        "su": "Sundanese",
+        "sw": "Swahili",
+        "sv": "Swedish",
+        "tg": "Tajik",
+        "ta": "Tamil",
+        "tt": "Tatar",
+        "te": "Telugu",
+        "th": "Thai",
+        "tr": "Turkish",
+        "tk": "Turkmen",
+        "uk": "Ukrainian",
+        "ur": "Urdu",
+        "ug": "Uyghur",
+        "uz": "Uzbek",
+        "vi": "Vietnamese",
+        "cy": "Welsh",
+        "xh": "Xhosa",
+        "yi": "Yiddish",
+        "yo": "Yoruba",
+        "zu": "Zulu"
+    },
+    "exceptions": {
+        "source": {
+            "zh_HANT": "zh"
+        },
+        "target": {
+            "auto": "en"
+        }
+    },
+    "mappings": {
+        "source": {},
+        "target": {
+            "zh": "zh-CN",
+            "zh_HANT": "zh-TW",
+            "auto": "en"
+        }
+    }
+}
diff --git a/src/utils/request.ts b/src/utils/request.ts
new file mode 100644
index 0000000..a2dca32
--- /dev/null
+++ b/src/utils/request.ts
@@ -0,0 +1,95 @@
+import axios, { AxiosResponse } from "axios";
+import UserAgent from "user-agents";
+import { LangCode } from "./language";
+
+export const Endpoint = {
+    TRANSLATION: "translation",
+    AUDIO: "audio"
+} as const;
+
+type EndpointType = typeof Endpoint[keyof typeof Endpoint];
+
+// Parameters accepted by each endpoint.
+type Params = {
+    translation: {
+        body: string;
+    };
+    audio: {
+        lang: LangCode;
+        text: string;
+    };
+};
+
+/**
+ * Builds a request against `endpoint`.
+ *
+ * `with(params)` starts the HTTP call right away and exposes it as `promise`.
+ * `doing(callback)` maps the response through `callback`; when the call or
+ * the callback fails, or the result is empty (see `isEmpty`), the whole
+ * request is repeated, at most 3 more times. It resolves to the last result,
+ * or to `null` when that result is `null` or `undefined`.
+ *
+ * @param endpoint - Endpoint to call.
+ * @param retry - Number of retries made so far.
+ */
+const request = <T extends EndpointType>(
+    endpoint: T,
+    retry: number = 0
+) => ({
+    with: (
+        params: Params[T]
+    ) => {
+        const promise = endpoint === "translation"
+            ? fetchTranslation(params as Params["translation"])
+            : fetchAudio(params as Params["audio"]);
+        return {
+            promise,
+            doing: <V>(
+                callback: (res: AxiosResponse) => V | undefined
+            ): Promise<V | null> => (
+                promise.then(callback)
+                    // Any failure is treated like an empty result so it takes the retry path.
+                    .catch(() => undefined)
+                    .then(result =>
+                        isEmpty(result) && retry < 3
+                            ? request(endpoint, retry + 1).with(params).doing(callback)
+                            : result ?? null
+                    )
+            )
+        }
+    }
+});
+
+// True for falsy values and for objects with a `length` of 0 or less (e.g. empty arrays).
+const isEmpty = (item: any) => (
+    !item || (typeof item === "object" && "length" in item && item.length <= 0)
+);
+
+// POSTs `body` as form data to the Google Translate batchexecute endpoint.
+const fetchTranslation = ({ body }: Params["translation"]) => (
+    axios.post(
+        "https://translate.google.com/_/TranslateWebserverUi/data/batchexecute?rpcids=MkEWBc&rt=c",
+        body,
+        {
+            headers: {
+                "User-Agent": new UserAgent().toString(),
+                "Content-Type": "application/x-www-form-urlencoded"
+            }
+        }
+    )
+);
+
+// GETs the speech audio for `text` in language `lang` as an ArrayBuffer.
+const fetchAudio = ({ lang, text }: Params["audio"]) => (
+    axios.get(
+        `https://translate.google.com/translate_tts?tl=${encodeURIComponent(lang)}&q=${encodeURIComponent(text)}&textlen=${text.length}&client=tw-ob`,
+        {
+            responseType: "arraybuffer",
+            headers: {
+                "User-Agent": new UserAgent().toString()
+            }
+        }
+    )
+);
+
+export default request;
diff --git a/tests/audio.test.ts b/tests/audio.test.ts
new file mode 100644
index 0000000..63459c6
--- /dev/null
+++ b/tests/audio.test.ts
@@ -0,0 +1,24 @@
+import { getAudio, LangCode } from "../src";
+
+const queries: [LangCode, string][] = [
+    ["es", "hola"],
+    ["ca", "gerd"],
+    ["en", "impression"],
+    ["auto", "impression"],
+    ["zh", "早安"],
+    ["zh_HANT", "早安"]
+];
+
+it("returns audio buffer correctly", async () => (
+    Promise.all(queries.map((query) => getAudio(...query)))
+        .then(results => results.forEach(audio => {
+            expect(audio).not.toBeNull();
+            audio?.forEach(int => expect(int).toEqual(expect.any(Number)));
+        }))
+));
+
+it("returns null on wrong language", async () => (
+    // @ts-expect-error "wrong" is deliberately not a valid LangCode
+    getAudio("wrong", "impression")
+        .then(audio => expect(audio).toBeNull())
+));
diff --git a/tsconfig.json b/tsconfig.json
index 832a3ab..e678339 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -6,7 +6,8 @@
         "rootDir": "./src",
         "outDir": "./dist",
         "esModuleInterop": true,
-        "strict": true
+        "strict": true,
+        "resolveJsonModule": true
     },
     "include": [
         "src/**/*.ts"