ungeschneuer · ungeschneuer · Jun 26, 2022 · Jun 25, 2022 · Jun 26, 2022
diff --git a/parser/text_parse.py b/parser/text_parse.py
@@ -4,191 +4,169 @@
 import xml_processing
 from database import add_to_queue, check_newness
 
+# Beginn des Dokumentes finden, auch bei abweichender Schreibweise ('Beginn:' oder 'Beginn').
+def find_beginn(text):
 
-
-
-# Normalisierungfunktion von nyt_first_said
-def normalize(raw_word):
-
-    # Ausfiltern von weiteren Zeichen im Testlauf
-    regexexp = re.compile('-{2,}')
-
-    # Entfernen von Zeichen (Wie schwer kann das sein??!!)
-    punctuation = r"""#"!$%&'())*+,‚."/:;<=>?@[\]^_`{|}~“”„"""
-    stripped_word = raw_word.translate(str.maketrans('', '', punctuation))
-
-
-    # Check ob Spiegelstrich einen Silbentrennung ist oder tatsächlich ganzes Wort
-    if (
-        '-' in stripped_word
-        and not stripped_word.endswith('-')
-        and not stripped_word.startswith('-')
-	        and regexexp.search(stripped_word)
-    ):
-        replaced = re.sub(regexexp, '-', stripped_word)
-        return normalize(replaced)
-
-    if stripped_word.endswith('ʼ') or stripped_word.endswith('’'):
-        stripped_word = stripped_word[:-1]
-
-    return stripped_word
-
-
-# Check ob ein valides Wort und weitere Korrigierung
-def ok_word(s):
-# Entfernung hier von html, bzw, und, oder, weil Aufzählungen mit Bindestrich und domains nicht gut rausgefiltert werden.
-    if len(s) < 5 or s.endswith(('ts', 'html', 'de', 'bzw', 'oder', 'und', 'wie', 'pdf')) or s.startswith('www') or s[-1].isupper(): 
-        return False
-
-    return (not any(i.isdigit() or i in '(.@/#-_§ ' for i in s))
-
-# Normalisiert das Wort, überprüft ob es schon im Speicher ist und fügt es der Queue hinzu
-def check_word(word, id):
-    norm_word = normalize(word)
-
-    if ok_word(norm_word):
-        if check_newness(norm_word, id):
-            add_to_queue(norm_word, id)
-            return True
-        else:
-            return False
+    if text.find('Beginn:') == -1:
+        text = text[text.find('Beginn'):]
     else:
-        return False
+        text = text[text.find('Beginn'):]
+
+    return text
 
+# Silbentrennung rückgängig machen. 
+def dehyphenate(text):
 
-# Filtert aus XML Datei die tatsächlichen Wortbeiträge
-def get_wortbeitraege(xml_file):
-
-    text = xml_processing.getText(xml_file)
-    if not text:
-        return False
+    lines = text.split('\n')
+    for num, line in enumerate(lines):
+        if line.endswith('-'):
+            # the end of the word is at the start of next line
+            end = lines[num+1].split()[0]
+            # we remove the - and append the end of the word
+            lines[num] = line[:-1] + end
+            # and remove the end of the word and possibly the 
+            # following space from the next line
+            lines[num+1] = lines[num+1][len(end)+1:]
 
-    sanitized = []
-    regex_url = '(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
+    return '\n'.join(lines)
 
-    for sentence in text:
-        # Encoding funktioniert nicht komplett, darum sanitizing
-        sentence = sentence.replace(u'\xa0', u' ') # Sonderzeichen entfernen
-        sentence = sentence.replace('\n', ' ') # Zeilenumbrüche
-        sentence = sentence.replace('  ', ' ') # Doppelte Leerzeichen
-        sentence = re.sub(regex_url, '', sentence) # URL-Filter
-        sanitized.append(sentence)
+# Cleaning vor dem Wordsplitting
+def pre_split_clean(text):
 
-    return sanitized
+    # Encoding funktioniert nicht komplett, darum sanitizing
+    text = text.replace(u'\xa0', u' ') # Sonderzeichen entfernen
+    text = text.replace('  ', ' ') # Doppelte Leerzeichen
 
+    regex_url = '(http|ftp|https|http)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
+    text = re.sub(regex_url, '', text) # URL-Filter
 
-# TODO Dehyphenation auf Line Level
-# https://stackoverflow.com/questions/43666790/python-how-do-i-merge-hyphenated-words-with-newlines
+    return text
+
+# Wörter splitten am Leerzeichen
 def wordsplitter(text):
     words = []
 
     try:
-        for sentence in text:
-            sentence_words = sentence.split()
-
-            # When uppercase letter in word split it
-            for word in sentence_words:
-                words += re.split('(?=[A-Z])', word)
+        words = text.split()
 
-        if 'Beginn:' in words:
-            words = words[words.index('Beginn:')+1:]
-        elif 'Beginn' in words:
-            words = words[words.index('Beginn')+1:]
     except Exception as e:
         logging.exception(e)
         exit()
 
     return words
 
+# Bei Aufzählungen (Wort endet auf '-') werden dieses Wort und die nächsten zwei Wörter entfernt.
+def de_enumaration(words):
+
+    clean_words = []
+    skip = 0
+
+    for word in words:
+        if skip > 0:
+            skip -= 1
+            continue
+
+        if word.endswith('-'):
+            skip = 2
+        else:
+            clean_words.append(word)
+
+    return clean_words
+
 
 def wordsfilter(words, id):  
     wordnum = 0
-    first_half = ""
-    skip = False
-    possible_hyphenation = False
 
     # Wort hat Buchstaben
     regchar = re.compile('([A-Z])|([a-z])\w+')
-    # Wort hat nicht gleiche Zeichen hintereinander
+    # Wort hat gleiche Zeichen hintereinander
     regmul = re.compile('([A-z])\1{3,}')
+    # Wort hat nicht nur am Anfang Großbuchstaben
+    regsmall = re.compile('[A-z]{1}[a-z]*[A-Z]+[a-z]*')
 
     for word in words:
-        if skip:
-            skip = False
-            continue
-        if regchar.search(word) and not regmul.search(word) and not ('http' in word):
-
-            # Checkt ob Silbentrennung Wörter getrennt hat
-            if possible_hyphenation:
-
-                # Wenn zweite Hälfte groß geschrieben ist, ist es ein neues Wort und beide werden einzelnd weiter geschickt.
-                if word[0].isupper() or word.startswith('-'):
-                    if check_word(first_half, id):
-                        wordnum += 1
-                    possible_hyphenation = False
-                    #Gleich aussortieren, wenn Wort mit Strich anfängt
-                    if word.startswith('-'):
-                        continue
-                # Aufzählung raus sortieren    
-                elif word == 'und' or word == 'oder' or word == 'bzw':
-                    if check_word(first_half, id):
-                        wordnum += 1
-                    possible_hyphenation = False
-                    # Nächsten Teil der Aufzählung gleich mit entfernen
-                    skip = True
-                    continue
-                else:
-                    # Wenn zweite Hälfte klein, dann kombinieren der beiden Wörter
-                    combined = first_half.strip('-') + word
-                    possible_hyphenation = False
-
-                    # TODO Check ob es ein tatsächliches Wort ist
-
-                    word = combined
-
-
-            # Wenn Wort mit Spiegelstrich endet dann zurückhalten und in der nächsten Iteration testen ob Silbentrennung
-            if word.endswith('-') and word.count('-') < 2:
-                first_half = word
-                possible_hyphenation = True
-                continue
+        if regchar.search(word) and not regmul.search(word) and not regsmall.search(word):
 
-            # Wortaufzählung entfernen
-            if word.startswith('-') or word.startswith('‑'):
+            # Entfernen von sonst nicht filterbaren Aufzählungen
+            if word.endswith('-,') or word.endswith('-') or word.startswith('-'):
                 continue
 
-
-            # Zusammefassung oder binäre Ansprache
+            # Trennung von Schrägstrich-Kompositionen
             if '/' in word:
                 splitted = word.split('/')
                 word = splitted[0]
 
                 if check_word(splitted[1], id):
                     wordnum += 1
-
 
             if check_word(word, id):
                 wordnum += 1
 
     return wordnum
 
+# Hauptfunktion des Moduls für die Aufbereitung und Trennung der Wörter
+def process_woerter (xml_file, id):
 
+    raw_text = xml_processing.getText(xml_file)
 
+    if not raw_text:
+        return False
+
+    text = find_beginn(raw_text)
+    text = pre_split_clean(text)
+    text = dehyphenate(text)
 
-def process_woerter (xml_file, id):
+    words = wordsplitter(text)
+    words = de_enumaration(words)
 
-    raw_results = get_wortbeitraege(xml_file)
+    return(wordsfilter(words, id))
 
-    if not raw_results:
-        return 0
-
-    words = wordsplitter(raw_results)
 
-    return(wordsfilter(words, id))
+# Normalisierung vor Datenbank-Abgleich des Wortes
+def normalize(raw_word):
+
+    # Ausfiltern von weiteren Zeichen im Testlauf
+    regexexp = re.compile('-{2,}')
+
+    # Entfernen von Zeichen (Wie schwer kann das sein??!!)
+    punctuation = r"""#"!$%&'())*+,‚.":;<=>?@[\]^_`{|}~“”„"""
+    stripped_word = raw_word.translate(str.maketrans('', '', punctuation))
+
+    if stripped_word.endswith('ʼ') or stripped_word.endswith('’'):
+        stripped_word = stripped_word[:-1]
+
+    return stripped_word
+
+
+# Check ob es ein valides Wort ist
+def ok_word(s):
+# Entfernung hier von html, bzw, und, oder, weil Aufzählungen mit Bindestrich und domains nicht gut rausgefiltert werden.
+    if len(s) < 5 or s.endswith(('ts', 'html', 'de', 'bzw', 'oder', 'und', 'wie', 'pdf')) or s.startswith('www') or s[-1].isupper(): 
+        return False
+
+    return (not any(i.isdigit() or i in '(.@/#-_§ ' for i in s))
+
+# Normalisiert das Wort, überprüft ob es schon im Speicher ist und fügt es der Queue hinzu
+def check_word(word, id):
+    norm_word = normalize(word)
+
+    if ok_word(norm_word):
+        if check_newness(norm_word, id):
+            add_to_queue(norm_word, id)
+            return True
+        else:
+            return False
+    else:
+        return False
+
 
 if __name__ == "__main__":
-    file = '/Users/marcel/Documents/2021/plenum_first_said.nosync/parser/archive/5445.xml'
+    file = '#'
     root = xml_processing.parse(file)
-    text = get_wortbeitraege(root)
+    text = xml_processing.getText(root)
+    text = find_beginn(text)
+    text = dehyphenate(text)
+    text = pre_split_clean(text)
     words = wordsplitter(text)
-    print(words)
+    words = de_enumaration(words)
+    print(words)
diff --git a/parser/xml_processing.py b/parser/xml_processing.py
@@ -25,7 +25,7 @@ def save(id, current_xml):
 
     return filename
 
-
+# XML-Dokument zur angegebenen ID abrufen
 def get(id):
 
     url = 'https://search.dip.bundestag.de/api/v1/plenarprotokoll-text/' + str(id) + '?apikey=' + api_key + '&format=xml'
@@ -45,26 +45,26 @@ def parse(filename):
     tree = ET.parse(filename)
     return tree.getroot()
 
-
+# Auf verschiedene Arten der Formatierung eingehen und den Text als String ausgeben.
 def getText(xml_file):
 
-    text = []
+    text_array = []
     klassen = ['J', '1','O', 'J_1', 'T']
 
     #Checken ob neues Format und Text rausziehen
     for p in xml_file.iter("p"):
         if any(value in p.attrib.values() for value in klassen):
-            text.append(p.text)
+            text_array.append(p.text)
 
     # Altes Format bekommen
-    if not text: 
+    if not text_array: 
         if xml_file.findall('text'):
-            text.append(xml_file.find('text').text)
+            text_array.append(xml_file.find('text').text)
         if xml_file.findall('TEXT'):
-            text.append(xml_file.find('TEXT').text)
+            text_array.append(xml_file.find('TEXT').text)
 
-    if not text:
+    if not text_array:
         return False
     else:        
-        return text
+        return ''.join(text_array)