-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathtranscript_check.py
More file actions
executable file
·164 lines (128 loc) · 5.22 KB
/
transcript_check.py
File metadata and controls
executable file
·164 lines (128 loc) · 5.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/python
import sys, getopt, os, subprocess
# Module-level settings. runScript() fills these in from the command line;
# transcribe() and compareTranscripts() read them.
csv_path = ""  # --input: CSV file listing the audio files and their transcripts
model_dir = ""  # --model-dir: directory searched for any path below not given explicitly
model_path = ""  # --model: passed to deepspeech as --model
alphabet_path = ""  # --alphabet: passed to deepspeech as --alphabet
lm_path = ""  # --lm: passed to deepspeech as --lm
trie_path = ""  # --trie: passed to deepspeech as --trie
# --threshold: a file is flagged when the word-count difference divided by the
# expected word count is not below this value.
threshold = 0.3
# --min-word-diff: word-count differences smaller than this never flag a file.
min_word_diff = 2
# --start-line: 1-based line of the CSV at which processing starts.
start_line = 1
def runScript():
    """Parse the command line, locate the model files and check every CSV row.

    Options are read from ``sys.argv`` and stored in the module-level
    settings so that ``transcribe`` and ``compareTranscripts`` can use them.
    When ``--model-dir`` is given, any model/alphabet/lm/trie path that was not
    passed explicitly is looked up inside that directory.

    Each CSV line from ``--start-line`` onwards is split on commas and must
    have exactly three fields; the first is the audio path and the third the
    expected transcript. Rows whose audio file does not exist are skipped.
    Rows whose transcription differs too much from the expected transcript
    (as decided by ``compareTranscripts``) are reported on stdout.

    Exits with status 2 on a usage error, status 1 when a required file is
    missing, and status 0 after printing help for ``-h``/``--help``.
    """
    global csv_path, model_dir, model_path, alphabet_path, lm_path, trie_path, threshold, min_word_diff, start_line

    try:
        # "h" must be declared as a short option, otherwise "-h" is rejected
        # by getopt and the help branch below can never run.
        opts, args = getopt.getopt(
            sys.argv[1:],
            "h",
            ["help", "input=", "model-dir=", "model=", "alphabet=", "lm=",
             "trie=", "threshold=", "min-word-diff=", "start-line="],
        )
    except getopt.GetoptError:
        _printUsage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            _printUsage()
            sys.exit()
        elif opt == "--input":
            csv_path = arg
        elif opt == "--model-dir":
            model_dir = arg
        elif opt == "--model":
            model_path = arg
        elif opt == "--alphabet":
            alphabet_path = arg
        elif opt == "--lm":
            lm_path = arg
        elif opt == "--trie":
            trie_path = arg
        elif opt == "--threshold":
            threshold = _parseNumber(float, opt, arg)
        elif opt == "--min-word-diff":
            min_word_diff = _parseNumber(int, opt, arg)
        elif opt == "--start-line":
            # Lines are 1-based; anything lower means "from the beginning".
            start_line = max(1, _parseNumber(int, opt, arg))

    if model_dir != "":
        # Figure out paths from directory
        default_trie = os.path.join(model_dir, "trie")
        if trie_path == "" and os.path.exists(default_trie):
            trie_path = default_trie
        if lm_path == "":
            lm_path = firstFileWithExtension("binary", model_dir)
        if alphabet_path == "":
            alphabet_path = firstFileWithExtension("txt", model_dir)
        if model_path == "":
            model_path = firstFileWithExtension("pbmm", model_dir)
            # Use non-memory-mapped model instead
            if model_path == "":
                model_path = firstFileWithExtension("pb", model_dir)

    # Sanity checks (sys.exit rather than the site-provided exit() builtin,
    # which is not guaranteed to exist in every interpreter setup).
    required_files = (
        (trie_path, "Trie not found"),
        (lm_path, "Language model not found"),
        (alphabet_path, "Alphabet not found"),
        (model_path, "Model not found"),
    )
    for path, message in required_files:
        if not os.path.exists(path):
            print(message)
            sys.exit(1)
    if not os.path.isfile(csv_path):
        # Without this check a missing --input ends in an open() traceback.
        print("Input CSV not found")
        sys.exit(1)

    # Parse CSV
    with open(csv_path) as f:
        for line_no, line in enumerate(f, start=1):
            if line_no < start_line:
                continue
            components = line.split(",")
            if len(components) != 3:
                print("Invalid line")
                continue
            wav_path = components[0]
            if not os.path.exists(wav_path):
                continue
            expected_transcript = components[2].strip()
            actual_transcript = transcribe(wav_path).strip()
            if not compareTranscripts(expected_transcript, actual_transcript):
                print("***")
                print("File: " + wav_path)
                print("Line: " + str(line_no))
                print("Expected transcript: " + expected_transcript)
                print("Actual transcript: " + actual_transcript)
                print("***")


def _printUsage():
    """Print the command-line usage text to stdout."""
    prog = os.path.basename(sys.argv[0])
    optional = "[--threshold <float>] [--min-word-diff <int>] [--start-line <int>]"
    print(prog + " --input <CSV path> --model-dir <dir> " + optional + "\n")
    print(prog + " --input <CSV path> --model <model> --alphabet <alphabet> --lm <lm> --trie <trie> " + optional)
    print("\n--model-dir: A directory containing a model, alphabet, language model and trie")
    print("--threshold: Percentage difference in transcript word count required to flag up a file (default is 0.3)")
    print("--min-word-diff: Minimum number of words to have changed in order to flag up a file (default is 2)")
    print("--start-line: Start processing at a specific line in the file. The first line starts at 1.")
    print("-h, --help: Show this help text and exit.")


def _parseNumber(convert, opt, arg):
    """Return ``convert(arg)``; on a malformed value print usage and exit with status 2."""
    try:
        return convert(arg)
    except ValueError:
        print("Invalid value for " + opt + ": " + arg + "\n")
        _printUsage()
        sys.exit(2)
def firstFileWithExtension(ext, folder):
    """Return the path of the first file under *folder* whose extension is *ext*.

    *ext* is given without the leading dot and is compared
    case-insensitively. *folder* is searched recursively, top directory
    first, with directories and file names visited in sorted order so the
    result does not depend on the file system's listing order.

    Returns an empty string when no matching file exists.
    """
    wanted = "." + ext.lower()
    for root, dirs, files in os.walk(folder):
        # Sorting in place steers os.walk's descent order as well.
        dirs.sort()
        for filename in sorted(files):
            if os.path.splitext(filename)[1].lower() == wanted:
                # Join with the directory the file was found in; joining
                # with *folder* yields a non-existent path for nested files.
                return os.path.join(root, filename)
    return ""
def transcribe(wav_path):
    """Run the ``deepspeech`` command on *wav_path* and return its stdout as text.

    The model, language model, trie and alphabet paths come from the
    module-level settings. The command's stderr is discarded and its stdout
    is decoded as UTF-8. Any exception raised by
    ``subprocess.check_output`` propagates to the caller.
    """
    global model_path, lm_path, trie_path, alphabet_path
    command = [
        "deepspeech",
        "--model", model_path,
        "--lm", lm_path,
        "--trie", trie_path,
        "--alphabet", alphabet_path,
        "--audio", wav_path,
    ]
    raw_output = subprocess.check_output(command, stderr=subprocess.DEVNULL)
    return raw_output.decode("utf-8")
def compareTranscripts(expected, actual):
    """Decide whether *actual* is close enough to *expected* by word count.

    Only the number of whitespace-separated words is compared, not the words
    themselves. Returns ``False`` (flag the file) when either transcript has
    no words. Returns ``True`` when the counts are equal or differ by fewer
    than ``min_word_diff`` words. Otherwise returns whether the difference,
    as a fraction of the expected word count, is below ``threshold``.
    """
    global min_word_diff
    n_expected = len(expected.split())
    n_actual = len(actual.split())

    # An empty transcript on either side is always flagged.
    if not (n_expected and n_actual):
        return False

    delta = abs(n_actual - n_expected)
    if delta == 0:
        return True
    # Small absolute differences are tolerated regardless of the ratio.
    if delta < min_word_diff:
        return True
    return delta / n_expected < threshold
# Entry point: this call is at module top level, so it runs as soon as the
# file is executed (and also if the file is imported).
runScript()