aboutcode-org · JonoYang · Jan 13, 2026 · Nov 12, 2025 · Nov 12, 2025 · Nov 20, 2025
diff --git a/.gitignore b/.gitignore
@@ -116,3 +116,7 @@ selenium
 rpmdb.sqlite-*
 /.ruff_cache/
 .env
+
+# Ignore the gibberish detector model: it is trained at configure and build
+# time and should not be committed
+gib_model.pki
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -13,6 +13,14 @@ Next release
 - Replace unmaintained ``toml`` library with ``tomllib`` / ``tomli``.
   https://github.com/aboutcode-org/scancode-toolkit/issues/4532
 
+- Add gibberish detection to copyright scanning. This is done using a
+  2-character Markov chain. A new CLI command,
+  ``scancode-train-gibberish-model``, has been added to regenerate the model
+  used by the detector.
+  https://github.com/aboutcode-org/scancode-toolkit/pull/4610
+  https://github.com/aboutcode-org/scancode-toolkit/issues/2402
+
+
 v32.4.1 - 2025-07-23
 --------------------
 

diff --git a/Dockerfile b/Dockerfile
@@ -38,11 +38,13 @@ WORKDIR /scancode-toolkit
 COPY . /scancode-toolkit
 
 # Initial configuration using ./configure, scancode-reindex-licenses to build
-# the base license index and scancode-reindex-package-patterns to build the 
-# package patterns cache
+# the base license index, scancode-reindex-package-patterns to build the package
+# patterns cache, and scancode-train-gibberish-model to train the Markov chain
+# model used for gibberish detection.
 RUN ./configure \
  && ./venv/bin/scancode-reindex-licenses \
- && ./venv/bin/scancode-reindex-package-patterns
+ && ./venv/bin/scancode-reindex-package-patterns \
+ && ./venv/bin/scancode-train-gibberish-model
 
 # Add scancode to path
 ENV PATH=/scancode-toolkit:$PATH

diff --git a/configure b/configure
@@ -319,6 +319,6 @@ find_python
 create_virtualenv "$VIRTUALENV_DIR"
 install_packages "$CFG_REQUIREMENTS"
 . "$CFG_BIN_DIR/activate"
-
+"$CFG_BIN_DIR/scancode-train-gibberish-model"
 
 set +e
diff --git a/configure.bat b/configure.bat
@@ -161,7 +161,7 @@ if %ERRORLEVEL% neq 0 (
     %CFG_QUIET% ^
     %PIP_EXTRA_ARGS% ^
     %CFG_REQUIREMENTS%
-
+"%CFG_BIN_DIR%\scancode-train-gibberish-model"
 
 @rem ################################
 :create_bin_junction

diff --git a/docs/source/reference/scancode-cli/cli-scancode-train-gibberish-model.rst b/docs/source/reference/scancode-cli/cli-scancode-train-gibberish-model.rst
@@ -0,0 +1,65 @@
+.. _cli-scancode-train-gibberish-model:
+
+ScanCode train gibberish model
+==============================
+
+ScanCode uses a 2-character Markov chain to perform gibberish detection on text.
+At a high level, it detects gibberish strings by checking whether a sequence of
+letters is part of a word or a whole word, two letters at a time. It does this
+by checking how likely it is to go from one letter to the next. These
+probabilities come from a model that has been trained on a large set of valid
+text: training counts each transition between letters and computes a
+probability from those counts. These probabilities and thresholds are stored
+in a model that is saved to a Python pickle.
+
+The training corpus for the gibberish detector can be found in
+``src/textcode/data/gibberish/``.
+
+``big.txt`` contains the main source of valid words that the gibberish detector
+model is trained on.
+
+``good.txt`` and ``bad.txt`` are used to determine the threshold: any string
+whose average letter-transition probability falls below this threshold is
+classified as gibberish.
+
+
+Usage: ``scancode-train-gibberish-model [OPTIONS]``
+
+Quick Reference
+---------------
+
+  --big FILE   Text file containing main training corpus for the gibberish
+               detector
+  --good FILE  Text file containing text considered to be not gibberish (good)
+  --bad FILE   Text file containing text considered to be gibberish (bad)
+  -h, --help   Show this message and exit.
+
+----
+
+.. _cli-scancode-train-gibberish-model-big-option:
+
+``--big`` option
+^^^^^^^^^^^^^^^^
+
+The ``--big`` option allows the user to use a different text file to train the
+gibberish detector model.
+
+.. _cli-scancode-train-gibberish-model-good-option:
+
+``--good`` option
+^^^^^^^^^^^^^^^^^
+
+The ``--good`` option allows the user to use a different text file containing
+strings considered to be valid copyrights. This option is used to adjust the
+average transition probability threshold that determines whether or not a string
+is gibberish.
+
+.. _cli-scancode-train-gibberish-model-bad-option:
+
+``--bad`` option
+^^^^^^^^^^^^^^^^
+
+The ``--bad`` option allows the user to use a different text file containing
+strings considered to be invalid copyrights. This option is used to adjust the
+average transition probability threshold that determines whether or not a string
+is gibberish.
diff --git a/docs/source/reference/scancode-cli/index.rst b/docs/source/reference/scancode-cli/index.rst
@@ -88,3 +88,4 @@ For more details into the post-scan CLI options, see :ref:`cli-post-scan-options
    cli-extractcode
    cli-scancode-reindex-licenses
    cli-scancode-license-data
+   cli-scancode-train-gibberish-model
diff --git a/etc/release/scancode-create-pypi-wheel.sh b/etc/release/scancode-create-pypi-wheel.sh
@@ -20,6 +20,7 @@ set -e
 ./configure --dev
 venv/bin/scancode-reindex-licenses
 venv/bin/scancode-reindex-package-patterns
+venv/bin/scancode-train-gibberish-model
 
 python_tag=$( python -c "import platform;print(f\"cp{''.join(platform.python_version_tuple()[:2])}\")" )
 

diff --git a/etc/release/scancode-create-release-app-linux.sh b/etc/release/scancode-create-release-app-linux.sh
@@ -66,6 +66,7 @@ cp -r etc/thirdparty $release_dir/etc
 ./configure --dev
 venv/bin/scancode-reindex-licenses
 venv/bin/scancode-reindex-package-patterns
+venv/bin/scancode-train-gibberish-model
 venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version
 
 cp -r \

diff --git a/etc/release/scancode-create-release-app-macos.sh b/etc/release/scancode-create-release-app-macos.sh
@@ -64,6 +64,7 @@ cp -r etc/thirdparty $release_dir/etc
 ./configure --dev
 venv/bin/scancode-reindex-licenses
 venv/bin/scancode-reindex-package-patterns
+venv/bin/scancode-train-gibberish-model
 venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version
 
 cp -r \

diff --git a/etc/release/scancode-create-release-app-windows.sh b/etc/release/scancode-create-release-app-windows.sh
@@ -63,6 +63,7 @@ cp -r etc/thirdparty $release_dir/etc
 ./configure --dev
 venv/bin/scancode-reindex-licenses
 venv/bin/scancode-reindex-package-patterns
+venv/bin/scancode-train-gibberish-model
 venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version
 
 cp -r \

diff --git a/setup-mini.cfg b/setup-mini.cfg
@@ -162,6 +162,7 @@ console_scripts =
     regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
     add-required-phrases = licensedcode.required_phrases:add_required_phrases
 	gen-new-required-phrases-rules = licensedcode.required_phrases:gen_required_phrases_rules
+    scancode-train-gibberish-model = textcode.train_gibberish_model:train_gibberish_model
 
 # These are configurations for ScanCode plugins as setuptools entry points.
 # Each plugin entry hast this form:

diff --git a/setup.cfg b/setup.cfg
@@ -164,6 +164,7 @@ console_scripts =
     regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
     add-required-phrases = licensedcode.required_phrases:add_required_phrases
 	gen-new-required-phrases-rules = licensedcode.required_phrases:gen_required_phrases_rules
+    scancode-train-gibberish-model = textcode.train_gibberish_model:train_gibberish_model
 
 # These are configurations for ScanCode plugins as setuptools entry points.
 # Each plugin entry hast this form:

diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py
@@ -26,6 +26,7 @@
 from pygmars.tree import Tree
 
 from cluecode import copyrights_hint
+from textcode.gibberish import Gibberish
 from textcode.markup import strip_known_markup_from_text
 
 # Tracing flags
@@ -60,6 +61,7 @@ def logger_debug(*args):
 if TRACE_DEEP:
     logger_debug = print
 
+
 """
 Detect and collect copyright statements.
 
@@ -197,6 +199,7 @@ def detect_copyrights_from_lines(
         if TRACE or TRACE_DEEP:
             logger_debug(f'\n========================================================================')
             logger_debug(f'detect_copyrights_from_lines: processing candidate_lines group:')
+
             for can in candidate_lines:
                 logger_debug(f'  {can}')
 
@@ -4265,6 +4268,8 @@ def strip_balanced_edge_parens(s):
 
 is_only_digit_and_punct = re.compile('^[^A-Za-z]+$').match
 
+gibberish_detector = Gibberish()
+
 
 def is_candidate(prepared_line):
     """
@@ -4282,6 +4287,11 @@ def is_candidate(prepared_line):
 
         return False
 
+    if gibberish_detector.detect_gibberish(prepared_line):
+        if TRACE:
+            logger_debug(f'is_candidate: gibberish_detector.detect_gibberish:\n{prepared_line!r}')
+        return False
+
     if copyrights_hint.years(prepared_line):
         return True
     else:

diff --git a/src/textcode/data/gibberish/bad.txt b/src/textcode/data/gibberish/bad.txt
@@ -0,0 +1,5 @@
+zxcvwerjasc
+nmnjcviburili,<>
+zxcvnadtruqe
+ertrjiloifdfyyoiu
+grty iuewdiivjh