qurator-spk · b2m · May 27, 2021 · May 27, 2021 · Jun 8, 2021 · Jun 11, 2021
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -20,4 +20,4 @@ workflows:
       - test:
           matrix:
             parameters:
-              python-version: ["3.5", "3.6", "3.7", "3.8", "3.9"]
+              python-version: ["3.6", "3.7", "3.8", "3.9"]
diff --git a/.screenshots/dinglehopper.png b/.screenshots/dinglehopper.png
diff --git a/README.md b/README.md
@@ -34,21 +34,27 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
   dinglehopper detects if GT/OCR are ALTO or PAGE XML documents to extract
   their text and falls back to plain text if no ALTO or PAGE is detected.
 
-  The files GT and OCR are usually a ground truth document and the result of
-  an OCR software, but you may use dinglehopper to compare two OCR results.
-  In that case, use --no-metrics to disable the then meaningless metrics and
-  also change the color scheme from green/red to blue.
+  The files GT and OCR are usually a ground truth document and the result of
+  an OCR software, but you may use dinglehopper to compare two OCR results. In
+  that case, use --metrics='' to disable the then meaningless metrics and also
+  change the color scheme from green/red to blue.
 
   The comparison report will be written to $REPORT_PREFIX.{html,json}, where
-  $REPORT_PREFIX defaults to "report". The reports include the character
-  error rate (CER) and the word error rate (WER).
+  $REPORT_PREFIX defaults to "report". Depending on your configuration, the
+  reports include the character accuracy/error rate (CA|CER), the word accuracy/error
+  rate (WA|WER), the bag of chars accuracy (BoC), and the bag of words accuracy (BoW).
+  The metrics can be chosen via a comma-separated combination of their acronyms,
+  like "--metrics=ca,wer,boc,bow".
+
+  The html report can be enabled/disabled using --html/--no-html.
 
   By default, the text of PAGE files is extracted on 'region' level. You may
   use "--textequiv-level line" to extract from the level of TextLine tags.
 
 Options:
-  --metrics / --no-metrics  Enable/disable metrics and green/red
+  --metrics                 Enable different metrics like ca|cer, wa|wer, boc and bow.
   --textequiv-level LEVEL   PAGE TextEquiv level to extract text from
+  --html / --no-html        Enable/disable html report.
   --progress                Show progress bar
   --help                    Show this message and exit.
 ~~~
@@ -80,12 +86,13 @@ The OCR-D processor has these parameters:
 
 | Parameter                 | Meaning                                                             |
 | ------------------------- | ------------------------------------------------------------------- |
-| `-P metrics false`        | Disable metrics and the green-red color scheme (default: enabled)   |
+| `-P metrics cer,wer`      | Enable character error rate and word error rate (default)           |
 | `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) |
+| `-P html false`           | Disable the html report (default: enabled)                          |
 
 For example:
 ~~~
-ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false
+ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer -P html false
 ~~~
 
 Developer information

diff --git a/qurator/dinglehopper/__init__.py b/qurator/dinglehopper/__init__.py
@@ -1,5 +0,0 @@
-from .ocr_files import *
-from .extracted_text import *
-from .character_error_rate import *
-from .word_error_rate import *
-from .align import *

diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
@@ -1,10 +1,11 @@
-from .edit_distance import *
+from .edit_distance import seq_editops
+from .normalize import chars_normalized
 
 
 def align(t1, t2):
     """Align text."""
-    s1 = list(grapheme_clusters(unicodedata.normalize("NFC", t1)))
-    s2 = list(grapheme_clusters(unicodedata.normalize("NFC", t2)))
+    s1 = chars_normalized(t1)
+    s2 = chars_normalized(t2)
     return seq_align(s1, s2)
 
 

diff --git a/qurator/dinglehopper/character_error_rate.py b/qurator/dinglehopper/character_error_rate.py