srt_file_translator

  1import codecs, type_enforced, re
  2from google.cloud import translate_v2 as translate
  3
  4
  5@type_enforced.Enforcer
  6class SRT_Utils:
  7    def parse_srt(
  8        self, filepath: str, statement_delimiters: list = [".", "?", "!"]
  9    ):
 10        """
 11        Parses an SRT file into a dictionary of statements.
 12        The keys of the dictionary are the time stamps of the statements.
 13        The values of the dictionary are the statements themselves.
 14        Statements that are split across multiple lines are aggregated.
 15
 16        Arguments:
 17
 18        * **`filepath`**: `[str]` → The path to the SRT file to be parsed.
 19        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.
 20        """
 21        time_structure = re.compile(
 22            "\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}"
 23        )
 24
 25        last_time = "00:00:00,000 --> 00:00:00,000"
 26        srt_data = {}
 27
 28        with open(filepath) as filedata:
 29            for line in filedata:
 30                line_data = line[:-1]
 31                if time_structure.match(line_data) is not None:
 32                    last_time = line_data
 33                    srt_data[last_time] = []
 34                else:
 35                    if last_time not in srt_data:
 36                        srt_data[last_time] = []
 37                    srt_data[last_time].append(line_data)
 38        for key, value in srt_data.items():
 39            srt_data[key] = " ".join(value[:-1] + [""]).strip()
 40
 41        srt_data = self.aggregate_statements(
 42            srt_data=srt_data, statement_delimiters=statement_delimiters
 43        )
 44        return srt_data
 45
 46    def aggregate_statements(self, srt_data: dict, statement_delimiters: list):
 47        """
 48        Takes in a dictionary of SRT data and aggregates statements that are split across multiple lines.
 49        Items are aggregated until a statement delimiter is found at the end of a line.
 50
 51        Arguments:
 52
 53        * **`srt_data`**: `[dict]` → The parsed SRT data.
 54        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.
 55
 56        Returns:
 57
 58        * **`out_data`**: `[dict]` → The aggregated SRT data.
 59
 60        EG:
 61
 62        ```python
 63
 64        srt_data = {
 65            "00:00:00,000 --> 00:00:01,000": "Hello World!",
 66            "00:00:01,000 --> 00:00:02,000": "This is",
 67            "00:00:02,000 --> 00:00:03,000": "a test."
 68        }
 69
 70        Translator.aggregate_statements(srt_data=srt_data, statement_delimiters=[".", "?", "!"])
 71        #=> {
 72        #=>     "00:00:00,000 --> 00:00:01,000": "Hello World!",
 73        #=>     "00:00:01,000 --> 00:00:03,000": "This is a test."
 74        #=> }
 75        """
 76        data = []
 77        for key, value in srt_data.items():
 78            data.append(
 79                {
 80                    "start": key.split(" --> ")[0],
 81                    "end": key.split(" --> ")[1],
 82                    "string": value,
 83                }
 84            )
 85        merged_data = []
 86        for idx, item in enumerate(data):
 87            if len(item["string"]) == 0:
 88                continue
 89            if (
 90                item["string"][-1] in statement_delimiters
 91                or idx == len(data) - 1
 92            ):
 93                merged_data.append(item)
 94            else:
 95                data[idx + 1]["string"] = (
 96                    item["string"] + " " + data[idx + 1]["string"]
 97                )
 98                data[idx + 1]["start"] = item["start"]
 99        out_data = {}
100        for item in merged_data:
101            out_data[item["start"] + " --> " + item["end"]] = item[
102                "string"
103            ].strip()
104        return out_data
105
106    def write_srt(self, filepath: str, srt_data: dict):
107        """
108        Writes SRT data to a file.
109
110        Arguments:
111
112        * **`filepath`**: `[str]` → The path to the SRT file to be written.
113        * **`srt_data`**: `[dict]` → The SRT data to be written to the file.
114        """
115        idx = 0
116        with codecs.open(filepath, "w+", encoding="utf-8-sig") as out_file:
117            for key, value in srt_data.items():
118                out_file.write(str(idx) + "\n")
119                out_file.write(key + "\n")
120                out_file.write(value + "\n")
121                out_file.write("\n")
122                idx += 1
123
124
@type_enforced.Enforcer
class Translator(SRT_Utils):
    """
    Translates SRT subtitle files with the Google Cloud Translation client.
    """

    def __init__(self, key_path: str):
        """
        Initializes the Translator class.

        Creates the translation client from a service account file and
        fetches the list of languages the client reports.

        Arguments:

        * **`key_path`**: `[str]` → The path to the Google Cloud API key.
            * You can create a key by following the instructions [here](https://cloud.google.com/translate/docs/setup).
        """
        self.__client__ = translate.Client.from_service_account_json(key_path)
        self.__languages__ = self.__client__.get_languages()
        # Language codes only, for fast membership checks.
        self.__available_languages__ = {
            language["language"] for language in self.__languages__
        }

    def show_languages(self):
        """
        Prints a list of available languages as `name (code)`, one per line.
        """
        for language in self.__languages__:
            print("{name} ({language})".format(**language))

    def translate(self, text: str, source_language: str, target_language: str):
        """
        Translates a string of text from one language to another.

        Arguments:

        * **`text`**: `[str]` → The text to be translated.
        * **`source_language`**: `[str]` → The language of the text to be translated.
        * **`target_language`**: `[str]` → The language to translate the text to.

        Returns:

        * The unmodified response of the client's `translate` call.
        """

        return self.__client__.translate(
            text,
            target_language=target_language,
            source_language=source_language,
        )

    def srt_file_translator(
        self,
        source_file: str,
        target_file: str,
        source_language: str,
        target_language: str,
        statement_delimiters: list = [".", "?", "!"],
    ):
        """
        Reads an SRT file, translates the text, and writes the translated text to a new SRT file.

        Arguments:

        * **`source_file`**: `[str]` → The path to the SRT file to be translated.
        * **`target_file`**: `[str]` → The path to the SRT file to be written.
        * **`source_language`**: `[str]` → The language of the text to be translated.
        * **`target_language`**: `[str]` → The language to translate the text to.
        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.

        Raises:

        * **`AssertionError`**: If a language is not available or a file path does not end with `.srt`.
        """
        # Function-scope import: only this method needs it.
        import html

        # Explicit checks instead of `assert` so the validation is not
        # stripped under `python -O`. AssertionError is kept so callers see
        # the same exception type as before.
        if source_language not in self.__available_languages__:
            raise AssertionError(
                "Source language not supported. Use Translator.show_languages() to see available languages."
            )
        if target_language not in self.__available_languages__:
            raise AssertionError(
                "Target language not supported. Use Translator.show_languages() to see available languages."
            )
        if not source_file.endswith(".srt"):
            raise AssertionError("Source file must be a .srt file")
        if not target_file.endswith(".srt"):
            raise AssertionError("Target file must be a .srt file")

        # Parse SRT
        srt_data = self.parse_srt(
            filepath=source_file, statement_delimiters=statement_delimiters
        )

        # Translate in chunks of 128 statements per request.
        chunk_size = 128
        srt_data_values = list(srt_data.values())
        translations = []
        for offset in range(0, len(srt_data_values), chunk_size):
            chunk = srt_data_values[offset : offset + chunk_size]
            results = self.__client__.translate(
                chunk,
                target_language=target_language,
                source_language=source_language,
            )
            # No `format_` is passed, so the API works in its default HTML
            # mode and may return entities such as `&#39;`. Decode them so
            # they are not written literally into the subtitle file.
            translations.extend(
                html.unescape(result["translatedText"]) for result in results
            )
        output_srt_data = dict(zip(srt_data.keys(), translations))
        self.write_srt(filepath=target_file, srt_data=output_srt_data)
@type_enforced.Enforcer
class SRT_Utils:
  6@type_enforced.Enforcer
  7class SRT_Utils:
  8    def parse_srt(
  9        self, filepath: str, statement_delimiters: list = [".", "?", "!"]
 10    ):
 11        """
 12        Parses an SRT file into a dictionary of statements.
 13        The keys of the dictionary are the time stamps of the statements.
 14        The values of the dictionary are the statements themselves.
 15        Statements that are split across multiple lines are aggregated.
 16
 17        Arguments:
 18
 19        * **`filepath`**: `[str]` → The path to the SRT file to be parsed.
 20        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.
 21        """
 22        time_structure = re.compile(
 23            "\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}"
 24        )
 25
 26        last_time = "00:00:00,000 --> 00:00:00,000"
 27        srt_data = {}
 28
 29        with open(filepath) as filedata:
 30            for line in filedata:
 31                line_data = line[:-1]
 32                if time_structure.match(line_data) is not None:
 33                    last_time = line_data
 34                    srt_data[last_time] = []
 35                else:
 36                    if last_time not in srt_data:
 37                        srt_data[last_time] = []
 38                    srt_data[last_time].append(line_data)
 39        for key, value in srt_data.items():
 40            srt_data[key] = " ".join(value[:-1] + [""]).strip()
 41
 42        srt_data = self.aggregate_statements(
 43            srt_data=srt_data, statement_delimiters=statement_delimiters
 44        )
 45        return srt_data
 46
 47    def aggregate_statements(self, srt_data: dict, statement_delimiters: list):
 48        """
 49        Takes in a dictionary of SRT data and aggregates statements that are split across multiple lines.
 50        Items are aggregated until a statement delimiter is found at the end of a line.
 51
 52        Arguments:
 53
 54        * **`srt_data`**: `[dict]` → The parsed SRT data.
 55        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.
 56
 57        Returns:
 58
 59        * **`out_data`**: `[dict]` → The aggregated SRT data.
 60
 61        EG:
 62
 63        ```python
 64
 65        srt_data = {
 66            "00:00:00,000 --> 00:00:01,000": "Hello World!",
 67            "00:00:01,000 --> 00:00:02,000": "This is",
 68            "00:00:02,000 --> 00:00:03,000": "a test."
 69        }
 70
 71        Translator.aggregate_statements(srt_data=srt_data, statement_delimiters=[".", "?", "!"])
 72        #=> {
 73        #=>     "00:00:00,000 --> 00:00:01,000": "Hello World!",
 74        #=>     "00:00:01,000 --> 00:00:03,000": "This is a test."
 75        #=> }
 76        """
 77        data = []
 78        for key, value in srt_data.items():
 79            data.append(
 80                {
 81                    "start": key.split(" --> ")[0],
 82                    "end": key.split(" --> ")[1],
 83                    "string": value,
 84                }
 85            )
 86        merged_data = []
 87        for idx, item in enumerate(data):
 88            if len(item["string"]) == 0:
 89                continue
 90            if (
 91                item["string"][-1] in statement_delimiters
 92                or idx == len(data) - 1
 93            ):
 94                merged_data.append(item)
 95            else:
 96                data[idx + 1]["string"] = (
 97                    item["string"] + " " + data[idx + 1]["string"]
 98                )
 99                data[idx + 1]["start"] = item["start"]
100        out_data = {}
101        for item in merged_data:
102            out_data[item["start"] + " --> " + item["end"]] = item[
103                "string"
104            ].strip()
105        return out_data
106
107    def write_srt(self, filepath: str, srt_data: dict):
108        """
109        Writes SRT data to a file.
110
111        Arguments:
112
113        * **`filepath`**: `[str]` → The path to the SRT file to be written.
114        * **`srt_data`**: `[dict]` → The SRT data to be written to the file.
115        """
116        idx = 0
117        with codecs.open(filepath, "w+", encoding="utf-8-sig") as out_file:
118            for key, value in srt_data.items():
119                out_file.write(str(idx) + "\n")
120                out_file.write(key + "\n")
121                out_file.write(value + "\n")
122                out_file.write("\n")
123                idx += 1
def parse_srt(self, filepath: str, statement_delimiters: list = ['.', '?', '!']):
 8    def parse_srt(
 9        self, filepath: str, statement_delimiters: list = [".", "?", "!"]
10    ):
11        """
12        Parses an SRT file into a dictionary of statements.
13        The keys of the dictionary are the time stamps of the statements.
14        The values of the dictionary are the statements themselves.
15        Statements that are split across multiple lines are aggregated.
16
17        Arguments:
18
19        * **`filepath`**: `[str]` → The path to the SRT file to be parsed.
20        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.
21        """
22        time_structure = re.compile(
23            "\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}"
24        )
25
26        last_time = "00:00:00,000 --> 00:00:00,000"
27        srt_data = {}
28
29        with open(filepath) as filedata:
30            for line in filedata:
31                line_data = line[:-1]
32                if time_structure.match(line_data) is not None:
33                    last_time = line_data
34                    srt_data[last_time] = []
35                else:
36                    if last_time not in srt_data:
37                        srt_data[last_time] = []
38                    srt_data[last_time].append(line_data)
39        for key, value in srt_data.items():
40            srt_data[key] = " ".join(value[:-1] + [""]).strip()
41
42        srt_data = self.aggregate_statements(
43            srt_data=srt_data, statement_delimiters=statement_delimiters
44        )
45        return srt_data

Parses an SRT file into a dictionary of statements. The keys of the dictionary are the time stamps of the statements. The values of the dictionary are the statements themselves. Statements that are split across multiple lines are aggregated.

Arguments:

  • filepath: [str] → The path to the SRT file to be parsed.
  • statement_delimiters: [list] → A list of characters that indicate the end of a statement. Defaults to [".", "?", "!"].
def aggregate_statements(self, srt_data: dict, statement_delimiters: list):
 47    def aggregate_statements(self, srt_data: dict, statement_delimiters: list):
 48        """
 49        Takes in a dictionary of SRT data and aggregates statements that are split across multiple lines.
 50        Items are aggregated until a statement delimiter is found at the end of a line.
 51
 52        Arguments:
 53
 54        * **`srt_data`**: `[dict]` → The parsed SRT data.
 55        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.
 56
 57        Returns:
 58
 59        * **`out_data`**: `[dict]` → The aggregated SRT data.
 60
 61        EG:
 62
 63        ```python
 64
 65        srt_data = {
 66            "00:00:00,000 --> 00:00:01,000": "Hello World!",
 67            "00:00:01,000 --> 00:00:02,000": "This is",
 68            "00:00:02,000 --> 00:00:03,000": "a test."
 69        }
 70
 71        Translator.aggregate_statements(srt_data=srt_data, statement_delimiters=[".", "?", "!"])
 72        #=> {
 73        #=>     "00:00:00,000 --> 00:00:01,000": "Hello World!",
 74        #=>     "00:00:01,000 --> 00:00:03,000": "This is a test."
 75        #=> }
 76        """
 77        data = []
 78        for key, value in srt_data.items():
 79            data.append(
 80                {
 81                    "start": key.split(" --> ")[0],
 82                    "end": key.split(" --> ")[1],
 83                    "string": value,
 84                }
 85            )
 86        merged_data = []
 87        for idx, item in enumerate(data):
 88            if len(item["string"]) == 0:
 89                continue
 90            if (
 91                item["string"][-1] in statement_delimiters
 92                or idx == len(data) - 1
 93            ):
 94                merged_data.append(item)
 95            else:
 96                data[idx + 1]["string"] = (
 97                    item["string"] + " " + data[idx + 1]["string"]
 98                )
 99                data[idx + 1]["start"] = item["start"]
100        out_data = {}
101        for item in merged_data:
102            out_data[item["start"] + " --> " + item["end"]] = item[
103                "string"
104            ].strip()
105        return out_data

Takes in a dictionary of SRT data and aggregates statements that are split across multiple lines. Items are aggregated until a statement delimiter is found at the end of a line.

Arguments:

  • srt_data: [dict] → The parsed SRT data.
  • statement_delimiters: [list] → A list of characters that indicate the end of a statement. This argument is required here (no default is defined); parse_srt passes [".", "?", "!"] by default.

Returns:

  • out_data: [dict] → The aggregated SRT data.

EG:

```python
srt_data = {
    "00:00:00,000 --> 00:00:01,000": "Hello World!",
    "00:00:01,000 --> 00:00:02,000": "This is",
    "00:00:02,000 --> 00:00:03,000": "a test."
}

Translator.aggregate_statements(srt_data=srt_data, statement_delimiters=[".", "?", "!"])
#=> {
#=>     "00:00:00,000 --> 00:00:01,000": "Hello World!",
#=>     "00:00:01,000 --> 00:00:03,000": "This is a test."
#=> }
```

def write_srt(self, filepath: str, srt_data: dict):
107    def write_srt(self, filepath: str, srt_data: dict):
108        """
109        Writes SRT data to a file.
110
111        Arguments:
112
113        * **`filepath`**: `[str]` → The path to the SRT file to be written.
114        * **`srt_data`**: `[dict]` → The SRT data to be written to the file.
115        """
116        idx = 0
117        with codecs.open(filepath, "w+", encoding="utf-8-sig") as out_file:
118            for key, value in srt_data.items():
119                out_file.write(str(idx) + "\n")
120                out_file.write(key + "\n")
121                out_file.write(value + "\n")
122                out_file.write("\n")
123                idx += 1

Writes SRT data to a file.

Arguments:

  • filepath: [str] → The path to the SRT file to be written.
  • srt_data: [dict] → The SRT data to be written to the file.
@type_enforced.Enforcer
class Translator(SRT_Utils):
@type_enforced.Enforcer
class Translator(SRT_Utils):
    """
    Translates SRT subtitle files with the Google Cloud Translation client.
    """

    def __init__(self, key_path: str):
        """
        Initializes the Translator class.

        Creates the translation client from a service account file and
        fetches the list of languages the client reports.

        Arguments:

        * **`key_path`**: `[str]` → The path to the Google Cloud API key.
            * You can create a key by following the instructions [here](https://cloud.google.com/translate/docs/setup).
        """
        self.__client__ = translate.Client.from_service_account_json(key_path)
        self.__languages__ = self.__client__.get_languages()
        # Language codes only, for fast membership checks.
        self.__available_languages__ = {
            language["language"] for language in self.__languages__
        }

    def show_languages(self):
        """
        Prints a list of available languages as `name (code)`, one per line.
        """
        for language in self.__languages__:
            print("{name} ({language})".format(**language))

    def translate(self, text: str, source_language: str, target_language: str):
        """
        Translates a string of text from one language to another.

        Arguments:

        * **`text`**: `[str]` → The text to be translated.
        * **`source_language`**: `[str]` → The language of the text to be translated.
        * **`target_language`**: `[str]` → The language to translate the text to.

        Returns:

        * The unmodified response of the client's `translate` call.
        """

        return self.__client__.translate(
            text,
            target_language=target_language,
            source_language=source_language,
        )

    def srt_file_translator(
        self,
        source_file: str,
        target_file: str,
        source_language: str,
        target_language: str,
        statement_delimiters: list = [".", "?", "!"],
    ):
        """
        Reads an SRT file, translates the text, and writes the translated text to a new SRT file.

        Arguments:

        * **`source_file`**: `[str]` → The path to the SRT file to be translated.
        * **`target_file`**: `[str]` → The path to the SRT file to be written.
        * **`source_language`**: `[str]` → The language of the text to be translated.
        * **`target_language`**: `[str]` → The language to translate the text to.
        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.

        Raises:

        * **`AssertionError`**: If a language is not available or a file path does not end with `.srt`.
        """
        # Function-scope import: only this method needs it.
        import html

        # Explicit checks instead of `assert` so the validation is not
        # stripped under `python -O`. AssertionError is kept so callers see
        # the same exception type as before.
        if source_language not in self.__available_languages__:
            raise AssertionError(
                "Source language not supported. Use Translator.show_languages() to see available languages."
            )
        if target_language not in self.__available_languages__:
            raise AssertionError(
                "Target language not supported. Use Translator.show_languages() to see available languages."
            )
        if not source_file.endswith(".srt"):
            raise AssertionError("Source file must be a .srt file")
        if not target_file.endswith(".srt"):
            raise AssertionError("Target file must be a .srt file")

        # Parse SRT
        srt_data = self.parse_srt(
            filepath=source_file, statement_delimiters=statement_delimiters
        )

        # Translate in chunks of 128 statements per request.
        chunk_size = 128
        srt_data_values = list(srt_data.values())
        translations = []
        for offset in range(0, len(srt_data_values), chunk_size):
            chunk = srt_data_values[offset : offset + chunk_size]
            results = self.__client__.translate(
                chunk,
                target_language=target_language,
                source_language=source_language,
            )
            # No `format_` is passed, so the API works in its default HTML
            # mode and may return entities such as `&#39;`. Decode them so
            # they are not written literally into the subtitle file.
            translations.extend(
                html.unescape(result["translatedText"]) for result in results
            )
        output_srt_data = dict(zip(srt_data.keys(), translations))
        self.write_srt(filepath=target_file, srt_data=output_srt_data)
Translator(key_path: str)
128    def __init__(self, key_path: str):
129        """
130        Initializes the Translator class.
131
132        Arguments:
133
134        * **`key_path`**: `[str]` → The path to the Google Cloud API key.
135            * You can create a key by following the instructions [here](https://cloud.google.com/translate/docs/setup).
136        """
137        self.__client__ = translate.Client.from_service_account_json(key_path)
138        self.__languages__ = self.__client__.get_languages()
139        self.__available_languages__ = set(
140            [language["language"] for language in self.__languages__]
141        )

Initializes the Translator class.

Arguments:

  • key_path: [str] → The path to the Google Cloud API key.
    • You can create a key by following the instructions at https://cloud.google.com/translate/docs/setup.
def show_languages(self):
143    def show_languages(self):
144        """
145        Prints a list of available languages.
146        """
147        for language in self.__languages__:
148            print("{name} ({language})".format(**language))

Prints a list of available languages.

def translate(self, text: str, source_language: str, target_language: str):
150    def translate(self, text: str, source_language: str, target_language: str):
151        """
152        Translates a string of text from one language to another.
153
154        Arguments:
155
156        * **`text`**: `[str]` → The text to be translated.
157        * **`source_language`**: `[str]` → The language of the text to be translated.
158        * **`target_language`**: `[str]` → The language to translate the text to.
159        """
160
161        return self.__client__.translate(
162            text,
163            target_language=target_language,
164            source_language=source_language,
165        )

Translates a string of text from one language to another.

Arguments:

  • text: [str] → The text to be translated.
  • source_language: [str] → The language of the text to be translated.
  • target_language: [str] → The language to translate the text to.
def srt_file_translator( self, source_file: str, target_file: str, source_language: str, target_language: str, statement_delimiters: list = ['.', '?', '!']):
167    def srt_file_translator(
168        self,
169        source_file: str,
170        target_file: str,
171        source_language: str,
172        target_language: str,
173        statement_delimiters: list = [".", "?", "!"],
174    ):
175        """
176        Reads an SRT file, translates the text, and writes the translated text to a new SRT file.
177
178        Arguments:
179
180        * **`source_file`**: `[str]` → The path to the SRT file to be translated.
181        * **`target_file`**: `[str]` → The path to the SRT file to be written.
182        * **`source_language`**: `[str]` → The language of the text to be translated.
183        * **`target_language`**: `[str]` → The language to translate the text to.
184        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.
185        """
186        # General Assertions
187        assert (
188            source_language in self.__available_languages__
189        ), "Source language not supported. Use Translator.show_languages() to see available languages."
190        assert (
191            target_language in self.__available_languages__
192        ), "Target language not supported. Use Translator.show_languages() to see available languages."
193        assert source_file.endswith(".srt"), "Source file must be a .srt file"
194        assert target_file.endswith(".srt"), "Target file must be a .srt file"
195
196        # Parse SRT
197        srt_data = self.parse_srt(
198            filepath=source_file, statement_delimiters=statement_delimiters
199        )
200
201        # Chunk SRT Data into 128 item chunks
202        srt_data_values = list(srt_data.values())
203        chunked_values = [
204            srt_data_values[i : i + 128]
205            for i in range(0, len(srt_data_values), 128)
206        ]
207        translations = []
208        for chunk in chunked_values:
209            translations += [
210                i["translatedText"]
211                for i in self.__client__.translate(
212                    chunk,
213                    target_language=target_language,
214                    source_language=source_language,
215                )
216            ]
217        output_srt_data = dict(zip(srt_data.keys(), translations))
218        self.write_srt(filepath=target_file, srt_data=output_srt_data)

Reads an SRT file, translates the text, and writes the translated text to a new SRT file.

Arguments:

  • source_file: [str] → The path to the SRT file to be translated.
  • target_file: [str] → The path to the SRT file to be written.
  • source_language: [str] → The language of the text to be translated.
  • target_language: [str] → The language to translate the text to.
  • statement_delimiters: [list] → A list of characters that indicate the end of a statement. Defaults to [".", "?", "!"].