srt_file_translator
import codecs
import re

import type_enforced
from google.cloud import translate_v2 as translate


@type_enforced.Enforcer
class SRT_Utils:
    def parse_srt(
        self,
        filepath: str,
        statement_delimiters: list = [".", "?", "!"],
        encoding: str = "utf-8-sig",
    ):
        """
        Parses an SRT file into a dictionary of statements.
        The keys of the dictionary are the time stamps of the statements.
        The values of the dictionary are the statements themselves.
        Statements that are split across multiple lines are aggregated.

        Arguments:

        * **`filepath`**: `[str]` → The path to the SRT file to be parsed.
        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.
        * **`encoding`**: `[str]` → The text encoding used to read the file. Defaults to `"utf-8-sig"` (the encoding `write_srt` produces).

        Returns:

        * **`srt_data`**: `[dict]` → The aggregated SRT data, keyed by `"start --> end"`.
        """
        time_structure = re.compile(
            r"\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}"
        )

        # Lines seen before the first time stamp are filed under this key.
        last_time = "00:00:00,000 --> 00:00:00,000"
        cue_lines = {}

        with open(filepath, encoding=encoding) as filedata:
            for line in filedata:
                # Strip only the line terminator; the final line may have none.
                line_data = line.rstrip("\r\n")
                if time_structure.match(line_data) is not None:
                    # The line right before a time stamp is the sequence
                    # number of the new cue, not text of the previous one.
                    previous = cue_lines.get(last_time)
                    if previous and previous[-1].strip().isdigit():
                        previous.pop()
                    last_time = line_data
                    cue_lines[last_time] = []
                else:
                    cue_lines.setdefault(last_time, []).append(line_data)

        srt_data = {
            key: " ".join(lines).strip() for key, lines in cue_lines.items()
        }
        return self.aggregate_statements(
            srt_data=srt_data, statement_delimiters=statement_delimiters
        )

    def aggregate_statements(self, srt_data: dict, statement_delimiters: list):
        """
        Takes in a dictionary of SRT data and aggregates statements that are split across multiple cues.
        Items are aggregated until a statement delimiter is found at the end of a cue
        (trailing whitespace is ignored) or the last cue is reached.

        Arguments:

        * **`srt_data`**: `[dict]` → The parsed SRT data.
        * **`statement_delimiters`**: `[list]` → A list of strings that indicate the end of a statement.

        Returns:

        * **`out_data`**: `[dict]` → The aggregated SRT data.

        EG:

        ```python
        srt_data = {
            "00:00:00,000 --> 00:00:01,000": "Hello World!",
            "00:00:01,000 --> 00:00:02,000": "This is",
            "00:00:02,000 --> 00:00:03,000": "a test."
        }

        Translator.aggregate_statements(srt_data=srt_data, statement_delimiters=[".", "?", "!"])
        #=> {
        #=>     "00:00:00,000 --> 00:00:01,000": "Hello World!",
        #=>     "00:00:01,000 --> 00:00:03,000": "This is a test."
        #=> }
        ```
        """
        # str.endswith needs a tuple; an empty delimiter would match everything.
        delimiters = tuple(d for d in statement_delimiters if d)
        last_idx = len(srt_data) - 1
        out_data = {}
        pending_start = None  # start time of an unfinished statement, if any
        pending_text = ""
        for idx, (key, value) in enumerate(srt_data.items()):
            parts = key.split(" --> ")
            start, end = parts[0], parts[1]
            if pending_start is not None:
                # Continue the unfinished statement, keeping its start time.
                start = pending_start
                text = pending_text + " " + value
            else:
                text = value
            if not text:
                continue
            if idx == last_idx or text.rstrip().endswith(delimiters):
                out_data[start + " --> " + end] = text.strip()
                pending_start = None
            else:
                pending_start, pending_text = start, text
        return out_data

    def write_srt(self, filepath: str, srt_data: dict):
        """
        Writes SRT data to a file (UTF-8 with BOM).

        Arguments:

        * **`filepath`**: `[str]` → The path to the SRT file to be written.
        * **`srt_data`**: `[dict]` → The SRT data to be written to the file.
        """
        with codecs.open(filepath, "w", encoding="utf-8-sig") as out_file:
            # SubRip sequence numbers are 1-based.
            for idx, (key, value) in enumerate(srt_data.items(), start=1):
                out_file.write(
                    str(idx) + "\n" + key + "\n" + value + "\n" + "\n"
                )


@type_enforced.Enforcer
class Translator(SRT_Utils):
    def __init__(self, key_path: str):
        """
        Initializes the Translator class.

        Arguments:

        * **`key_path`**: `[str]` → The path to the Google Cloud API key.
            * You can create a key by following the instructions [here](https://cloud.google.com/translate/docs/setup).
        """
        self.__client__ = translate.Client.from_service_account_json(key_path)
        self.__languages__ = self.__client__.get_languages()
        # Language codes only, for fast membership checks.
        self.__available_languages__ = {
            language["language"] for language in self.__languages__
        }

    def show_languages(self):
        """
        Prints a list of available languages.
        """
        for language in self.__languages__:
            print("{name} ({language})".format(**language))

    def translate(self, text: str, source_language: str, target_language: str):
        """
        Translates a string of text from one language to another.

        Arguments:

        * **`text`**: `[str]` → The text to be translated.
        * **`source_language`**: `[str]` → The language of the text to be translated.
        * **`target_language`**: `[str]` → The language to translate the text to.

        Returns the client's response unchanged.
        """
        return self.__client__.translate(
            text,
            target_language=target_language,
            source_language=source_language,
        )

    def srt_file_translator(
        self,
        source_file: str,
        target_file: str,
        source_language: str,
        target_language: str,
        statement_delimiters: list = [".", "?", "!"],
    ):
        """
        Reads an SRT file, translates the text, and writes the translated text to a new SRT file.

        Arguments:

        * **`source_file`**: `[str]` → The path to the SRT file to be translated.
        * **`target_file`**: `[str]` → The path to the SRT file to be written.
        * **`source_language`**: `[str]` → The language of the text to be translated.
        * **`target_language`**: `[str]` → The language to translate the text to.
        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.

        Raises:

        * `AssertionError` → If a language is not supported or a path does not end in `.srt`.
        """
        # Raised explicitly (not via `assert`) so validation survives `python -O`;
        # AssertionError is kept so existing callers see the same exception type.
        if source_language not in self.__available_languages__:
            raise AssertionError(
                "Source language not supported. Use Translator.show_languages() to see available languages."
            )
        if target_language not in self.__available_languages__:
            raise AssertionError(
                "Target language not supported. Use Translator.show_languages() to see available languages."
            )
        if not source_file.lower().endswith(".srt"):
            raise AssertionError("Source file must be a .srt file")
        if not target_file.lower().endswith(".srt"):
            raise AssertionError("Target file must be a .srt file")

        # Parse SRT
        srt_data = self.parse_srt(
            filepath=source_file, statement_delimiters=statement_delimiters
        )

        # Translate in chunks of 128 items.
        # NOTE(review): 128 is presumably the API's per-request limit - confirm.
        chunk_size = 128
        srt_data_values = list(srt_data.values())
        translations = []
        for i in range(0, len(srt_data_values), chunk_size):
            results = self.__client__.translate(
                srt_data_values[i : i + chunk_size],
                target_language=target_language,
                source_language=source_language,
                # Plain-text mode: the default HTML mode returns entities
                # such as `&#39;`, which would end up in the subtitles.
                format_="text",
            )
            translations.extend(item["translatedText"] for item in results)
        output_srt_data = dict(zip(srt_data.keys(), translations))
        self.write_srt(filepath=target_file, srt_data=output_srt_data)
@type_enforced.Enforcer
class SRT_Utils:
    def parse_srt(
        self,
        filepath: str,
        statement_delimiters: list = [".", "?", "!"],
        encoding: str = "utf-8-sig",
    ):
        """
        Parses an SRT file into a dictionary of statements.
        The keys of the dictionary are the time stamps of the statements.
        The values of the dictionary are the statements themselves.
        Statements that are split across multiple lines are aggregated.

        Arguments:

        * **`filepath`**: `[str]` → The path to the SRT file to be parsed.
        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.
        * **`encoding`**: `[str]` → The text encoding used to read the file. Defaults to `"utf-8-sig"` (the encoding `write_srt` produces).

        Returns:

        * **`srt_data`**: `[dict]` → The aggregated SRT data, keyed by `"start --> end"`.
        """
        time_structure = re.compile(
            r"\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}"
        )

        # Lines seen before the first time stamp are filed under this key.
        last_time = "00:00:00,000 --> 00:00:00,000"
        cue_lines = {}

        with open(filepath, encoding=encoding) as filedata:
            for line in filedata:
                # Strip only the line terminator; the final line may have none.
                line_data = line.rstrip("\r\n")
                if time_structure.match(line_data) is not None:
                    # The line right before a time stamp is the sequence
                    # number of the new cue, not text of the previous one.
                    previous = cue_lines.get(last_time)
                    if previous and previous[-1].strip().isdigit():
                        previous.pop()
                    last_time = line_data
                    cue_lines[last_time] = []
                else:
                    cue_lines.setdefault(last_time, []).append(line_data)

        srt_data = {
            key: " ".join(lines).strip() for key, lines in cue_lines.items()
        }
        return self.aggregate_statements(
            srt_data=srt_data, statement_delimiters=statement_delimiters
        )

    def aggregate_statements(self, srt_data: dict, statement_delimiters: list):
        """
        Takes in a dictionary of SRT data and aggregates statements that are split across multiple cues.
        Items are aggregated until a statement delimiter is found at the end of a cue
        (trailing whitespace is ignored) or the last cue is reached.

        Arguments:

        * **`srt_data`**: `[dict]` → The parsed SRT data.
        * **`statement_delimiters`**: `[list]` → A list of strings that indicate the end of a statement.

        Returns:

        * **`out_data`**: `[dict]` → The aggregated SRT data.

        EG:

        ```python
        srt_data = {
            "00:00:00,000 --> 00:00:01,000": "Hello World!",
            "00:00:01,000 --> 00:00:02,000": "This is",
            "00:00:02,000 --> 00:00:03,000": "a test."
        }

        Translator.aggregate_statements(srt_data=srt_data, statement_delimiters=[".", "?", "!"])
        #=> {
        #=>     "00:00:00,000 --> 00:00:01,000": "Hello World!",
        #=>     "00:00:01,000 --> 00:00:03,000": "This is a test."
        #=> }
        ```
        """
        # str.endswith needs a tuple; an empty delimiter would match everything.
        delimiters = tuple(d for d in statement_delimiters if d)
        last_idx = len(srt_data) - 1
        out_data = {}
        pending_start = None  # start time of an unfinished statement, if any
        pending_text = ""
        for idx, (key, value) in enumerate(srt_data.items()):
            parts = key.split(" --> ")
            start, end = parts[0], parts[1]
            if pending_start is not None:
                # Continue the unfinished statement, keeping its start time.
                start = pending_start
                text = pending_text + " " + value
            else:
                text = value
            if not text:
                continue
            if idx == last_idx or text.rstrip().endswith(delimiters):
                out_data[start + " --> " + end] = text.strip()
                pending_start = None
            else:
                pending_start, pending_text = start, text
        return out_data

    def write_srt(self, filepath: str, srt_data: dict):
        """
        Writes SRT data to a file (UTF-8 with BOM).

        Arguments:

        * **`filepath`**: `[str]` → The path to the SRT file to be written.
        * **`srt_data`**: `[dict]` → The SRT data to be written to the file.
        """
        with codecs.open(filepath, "w", encoding="utf-8-sig") as out_file:
            # SubRip sequence numbers are 1-based.
            for idx, (key, value) in enumerate(srt_data.items(), start=1):
                out_file.write(
                    str(idx) + "\n" + key + "\n" + value + "\n" + "\n"
                )
def parse_srt(
    self,
    filepath: str,
    statement_delimiters: list = [".", "?", "!"],
    encoding: str = "utf-8-sig",
):
    """
    Parses an SRT file into a dictionary of statements.
    The keys of the dictionary are the time stamps of the statements.
    The values of the dictionary are the statements themselves.
    Statements that are split across multiple lines are aggregated.

    Arguments:

    * **`filepath`**: `[str]` → The path to the SRT file to be parsed.
    * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.
    * **`encoding`**: `[str]` → The text encoding used to read the file. Defaults to `"utf-8-sig"` (the encoding `write_srt` produces).

    Returns:

    * **`srt_data`**: `[dict]` → The aggregated SRT data, keyed by `"start --> end"`.
    """
    time_structure = re.compile(
        r"\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}"
    )

    # Lines seen before the first time stamp are filed under this key.
    last_time = "00:00:00,000 --> 00:00:00,000"
    cue_lines = {}

    with open(filepath, encoding=encoding) as filedata:
        for line in filedata:
            # Strip only the line terminator; the final line may have none.
            line_data = line.rstrip("\r\n")
            if time_structure.match(line_data) is not None:
                # The line right before a time stamp is the sequence
                # number of the new cue, not text of the previous one.
                previous = cue_lines.get(last_time)
                if previous and previous[-1].strip().isdigit():
                    previous.pop()
                last_time = line_data
                cue_lines[last_time] = []
            else:
                cue_lines.setdefault(last_time, []).append(line_data)

    srt_data = {
        key: " ".join(lines).strip() for key, lines in cue_lines.items()
    }
    return self.aggregate_statements(
        srt_data=srt_data, statement_delimiters=statement_delimiters
    )
Parses an SRT file into a dictionary of statements. The keys of the dictionary are the time stamps of the statements. The values of the dictionary are the statements themselves. Statements that are split across multiple lines are aggregated.
Arguments:
filepath: [str] → The path to the SRT file to be parsed.
statement_delimiters: [list] → A list of characters that indicate the end of a statement. Defaults to [".", "?", "!"].
def aggregate_statements(self, srt_data: dict, statement_delimiters: list):
    """
    Takes in a dictionary of SRT data and aggregates statements that are split across multiple cues.
    Items are aggregated until a statement delimiter is found at the end of a cue
    (trailing whitespace is ignored) or the last cue is reached.

    Arguments:

    * **`srt_data`**: `[dict]` → The parsed SRT data.
    * **`statement_delimiters`**: `[list]` → A list of strings that indicate the end of a statement.

    Returns:

    * **`out_data`**: `[dict]` → The aggregated SRT data.

    EG:

    ```python
    srt_data = {
        "00:00:00,000 --> 00:00:01,000": "Hello World!",
        "00:00:01,000 --> 00:00:02,000": "This is",
        "00:00:02,000 --> 00:00:03,000": "a test."
    }

    Translator.aggregate_statements(srt_data=srt_data, statement_delimiters=[".", "?", "!"])
    #=> {
    #=>     "00:00:00,000 --> 00:00:01,000": "Hello World!",
    #=>     "00:00:01,000 --> 00:00:03,000": "This is a test."
    #=> }
    ```
    """
    # str.endswith needs a tuple; an empty delimiter would match everything.
    delimiters = tuple(d for d in statement_delimiters if d)
    last_idx = len(srt_data) - 1
    out_data = {}
    pending_start = None  # start time of an unfinished statement, if any
    pending_text = ""
    for idx, (key, value) in enumerate(srt_data.items()):
        parts = key.split(" --> ")
        start, end = parts[0], parts[1]
        if pending_start is not None:
            # Continue the unfinished statement, keeping its start time.
            start = pending_start
            text = pending_text + " " + value
        else:
            text = value
        if not text:
            continue
        if idx == last_idx or text.rstrip().endswith(delimiters):
            out_data[start + " --> " + end] = text.strip()
            pending_start = None
        else:
            pending_start, pending_text = start, text
    return out_data
Takes in a dictionary of SRT data and aggregates statements that are split across multiple lines. Items are aggregated until a statement delimiter is found at the end of a line.
Arguments:
srt_data: [dict] → The parsed SRT data.
statement_delimiters: [list] → A list of characters that indicate the end of a statement. Defaults to [".", "?", "!"].
Returns:
out_data: [dict] → The aggregated SRT data.
EG:
```python
srt_data = {
    "00:00:00,000 --> 00:00:01,000": "Hello World!",
    "00:00:01,000 --> 00:00:02,000": "This is",
    "00:00:02,000 --> 00:00:03,000": "a test."
}

Translator.aggregate_statements(srt_data=srt_data, statement_delimiters=[".", "?", "!"])
#=> {
#=>     "00:00:00,000 --> 00:00:01,000": "Hello World!",
#=>     "00:00:01,000 --> 00:00:03,000": "This is a test."
#=> }
```
def write_srt(self, filepath: str, srt_data: dict):
    """
    Writes SRT data to a file (UTF-8 with BOM).

    Each entry is written as a sequence number, the time stamp key, the
    text, and a blank separator line.

    Arguments:

    * **`filepath`**: `[str]` → The path to the SRT file to be written.
    * **`srt_data`**: `[dict]` → The SRT data to be written to the file.
    """
    with codecs.open(filepath, "w", encoding="utf-8-sig") as out_file:
        # SubRip sequence numbers are 1-based.
        for idx, (key, value) in enumerate(srt_data.items(), start=1):
            out_file.write(str(idx) + "\n" + key + "\n" + value + "\n" + "\n")
Writes SRT data to a file.
Arguments:
filepath: [str] → The path to the SRT file to be written.
srt_data: [dict] → The SRT data to be written to the file.
@type_enforced.Enforcer
class Translator(SRT_Utils):
    def __init__(self, key_path: str):
        """
        Initializes the Translator class.

        Arguments:

        * **`key_path`**: `[str]` → The path to the Google Cloud API key.
            * You can create a key by following the instructions [here](https://cloud.google.com/translate/docs/setup).
        """
        self.__client__ = translate.Client.from_service_account_json(key_path)
        self.__languages__ = self.__client__.get_languages()
        # Language codes only, for fast membership checks.
        self.__available_languages__ = {
            language["language"] for language in self.__languages__
        }

    def show_languages(self):
        """
        Prints a list of available languages.
        """
        for language in self.__languages__:
            print("{name} ({language})".format(**language))

    def translate(self, text: str, source_language: str, target_language: str):
        """
        Translates a string of text from one language to another.

        Arguments:

        * **`text`**: `[str]` → The text to be translated.
        * **`source_language`**: `[str]` → The language of the text to be translated.
        * **`target_language`**: `[str]` → The language to translate the text to.

        Returns the client's response unchanged.
        """
        return self.__client__.translate(
            text,
            target_language=target_language,
            source_language=source_language,
        )

    def srt_file_translator(
        self,
        source_file: str,
        target_file: str,
        source_language: str,
        target_language: str,
        statement_delimiters: list = [".", "?", "!"],
    ):
        """
        Reads an SRT file, translates the text, and writes the translated text to a new SRT file.

        Arguments:

        * **`source_file`**: `[str]` → The path to the SRT file to be translated.
        * **`target_file`**: `[str]` → The path to the SRT file to be written.
        * **`source_language`**: `[str]` → The language of the text to be translated.
        * **`target_language`**: `[str]` → The language to translate the text to.
        * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.

        Raises:

        * `AssertionError` → If a language is not supported or a path does not end in `.srt`.
        """
        # Raised explicitly (not via `assert`) so validation survives `python -O`;
        # AssertionError is kept so existing callers see the same exception type.
        if source_language not in self.__available_languages__:
            raise AssertionError(
                "Source language not supported. Use Translator.show_languages() to see available languages."
            )
        if target_language not in self.__available_languages__:
            raise AssertionError(
                "Target language not supported. Use Translator.show_languages() to see available languages."
            )
        if not source_file.lower().endswith(".srt"):
            raise AssertionError("Source file must be a .srt file")
        if not target_file.lower().endswith(".srt"):
            raise AssertionError("Target file must be a .srt file")

        # Parse SRT
        srt_data = self.parse_srt(
            filepath=source_file, statement_delimiters=statement_delimiters
        )

        # Translate in chunks of 128 items.
        # NOTE(review): 128 is presumably the API's per-request limit - confirm.
        chunk_size = 128
        srt_data_values = list(srt_data.values())
        translations = []
        for i in range(0, len(srt_data_values), chunk_size):
            results = self.__client__.translate(
                srt_data_values[i : i + chunk_size],
                target_language=target_language,
                source_language=source_language,
                # Plain-text mode: the default HTML mode returns entities
                # such as `&#39;`, which would end up in the subtitles.
                format_="text",
            )
            translations.extend(item["translatedText"] for item in results)
        output_srt_data = dict(zip(srt_data.keys(), translations))
        self.write_srt(filepath=target_file, srt_data=output_srt_data)
def __init__(self, key_path: str):
    """
    Initializes the Translator class.

    Builds the translation client from a service-account JSON file, then
    stores the client's language list and the set of its language codes.

    Arguments:

    * **`key_path`**: `[str]` → The path to the Google Cloud API key.
        * You can create a key by following the instructions [here](https://cloud.google.com/translate/docs/setup).
    """
    client = translate.Client.from_service_account_json(key_path)
    self.__client__ = client
    languages = client.get_languages()
    self.__languages__ = languages
    # Language codes only, for fast membership checks.
    self.__available_languages__ = {entry["language"] for entry in languages}
Initializes the Translator class.
Arguments:
key_path: [str] → The path to the Google Cloud API key.
You can create a key by following the instructions at https://cloud.google.com/translate/docs/setup.
def show_languages(self):
    """
    Prints a list of available languages.

    One line is printed per entry of the stored language list, formatted
    as `name (language)`.
    """
    template = "{name} ({language})"
    for entry in self.__languages__:
        print(template.format(**entry))
Prints a list of available languages.
def translate(self, text: str, source_language: str, target_language: str):
    """
    Translates a string of text from one language to another.

    Arguments:

    * **`text`**: `[str]` → The text to be translated.
    * **`source_language`**: `[str]` → The language of the text to be translated.
    * **`target_language`**: `[str]` → The language to translate the text to.

    Returns the client's response unchanged.
    """
    language_options = {
        "target_language": target_language,
        "source_language": source_language,
    }
    response = self.__client__.translate(text, **language_options)
    return response
Translates a string of text from one language to another.
Arguments:
text: [str] → The text to be translated.
source_language: [str] → The language of the text to be translated.
target_language: [str] → The language to translate the text to.
def srt_file_translator(
    self,
    source_file: str,
    target_file: str,
    source_language: str,
    target_language: str,
    statement_delimiters: list = [".", "?", "!"],
):
    """
    Reads an SRT file, translates the text, and writes the translated text to a new SRT file.

    Arguments:

    * **`source_file`**: `[str]` → The path to the SRT file to be translated.
    * **`target_file`**: `[str]` → The path to the SRT file to be written.
    * **`source_language`**: `[str]` → The language of the text to be translated.
    * **`target_language`**: `[str]` → The language to translate the text to.
    * **`statement_delimiters`**: `[list]` → A list of characters that indicate the end of a statement. Defaults to `[".", "?", "!"]`.

    Raises:

    * `AssertionError` → If a language is not supported or a path does not end in `.srt`.
    """
    # Raised explicitly (not via `assert`) so validation survives `python -O`;
    # AssertionError is kept so existing callers see the same exception type.
    if source_language not in self.__available_languages__:
        raise AssertionError(
            "Source language not supported. Use Translator.show_languages() to see available languages."
        )
    if target_language not in self.__available_languages__:
        raise AssertionError(
            "Target language not supported. Use Translator.show_languages() to see available languages."
        )
    if not source_file.lower().endswith(".srt"):
        raise AssertionError("Source file must be a .srt file")
    if not target_file.lower().endswith(".srt"):
        raise AssertionError("Target file must be a .srt file")

    # Parse SRT
    srt_data = self.parse_srt(
        filepath=source_file, statement_delimiters=statement_delimiters
    )

    # Translate in chunks of 128 items.
    # NOTE(review): 128 is presumably the API's per-request limit - confirm.
    chunk_size = 128
    srt_data_values = list(srt_data.values())
    translations = []
    for i in range(0, len(srt_data_values), chunk_size):
        results = self.__client__.translate(
            srt_data_values[i : i + chunk_size],
            target_language=target_language,
            source_language=source_language,
            # Plain-text mode: the default HTML mode returns entities
            # such as `&#39;`, which would end up in the subtitles.
            format_="text",
        )
        translations.extend(item["translatedText"] for item in results)
    output_srt_data = dict(zip(srt_data.keys(), translations))
    self.write_srt(filepath=target_file, srt_data=output_srt_data)
Reads an SRT file, translates the text, and writes the translated text to a new SRT file.
Arguments:
source_file: [str] → The path to the SRT file to be translated.
target_file: [str] → The path to the SRT file to be written.
source_language: [str] → The language of the text to be translated.
target_language: [str] → The language to translate the text to.
statement_delimiters: [list] → A list of characters that indicate the end of a statement. Defaults to [".", "?", "!"].