Add automatic checking for profanity

Dunedan · Dunedan · commit 1ed99e3fa880 · 2024-10-08T08:19:43.000+02:00
This adds functionality to automatically check for profanity in text
messages written in any of the XMPP MUC rooms monitored by the
moderation bot.

The terms considered profanity can be configured using the database
and are language-specific. They have to be stored in their lemmatized
form. If a supported language gets detected with an accuracy of 100%,
only terms for that language will be checked; otherwise English terms
will be checked as well. Supported languages for now are English,
French, German, Polish, Portuguese, Russian, Spanish and Turkish.

The first two times a user uses profanity within a sliding window of
three months, they'll receive a warning. Starting from the third time,
the user will get muted. At first users will be muted for five minutes,
with the duration increasing exponentially, up to one week, for each
continued use of profanity afterwards.

To enable this functionality the `--enable-profanity-monitoring`
command line option has to be provided.
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "cachetools",
     "defusedxml",
     "dateparser",
+    "simplemma[marisa-trie]>=1.1.1",
     "slixmpp>=1.8.0",
     "sqlalchemy>=2.0.4",
 ]
@@ -87,5 +88,5 @@ max-doc-length = 72
 convention = "pep257"
 
 [tool.ruff.lint.pylint]
-max-args = 8
+max-args = 10
 max-nested-blocks = 4
diff --git a/xpartamupp/lobby_moderation_db.py b/xpartamupp/lobby_moderation_db.py
@@ -24,6 +24,7 @@
 from typing import Any, ClassVar
 
 from sqlalchemy import (
+    JSON,
     DateTime,
     ForeignKey,
     String,
@@ -69,20 +70,13 @@ class Base(DeclarativeBase):
     }
 
 
-class Blacklist(Base):
+class ProfanityTerms(Base):
     """Model for profanity terms."""
 
-    __tablename__ = "profanity_blacklist"
+    __tablename__ = "profanity_terms"
 
-    word: Mapped[str] = mapped_column(String(255), primary_key=True)
-
-
-class Whitelist(Base):
-    """Model for terms which are whitelisted from profanity."""
-
-    __tablename__ = "profanity_whitelist"
-
-    word: Mapped[str] = mapped_column(String(255), primary_key=True)
+    term: Mapped[str] = mapped_column(String(255), primary_key=True)
+    language: Mapped[str] = mapped_column(String(2), primary_key=True)
 
 
 class ProfanityIncident(Base):
@@ -91,10 +85,12 @@ class ProfanityIncident(Base):
     __tablename__ = "profanity_incidents"
 
     id: Mapped[int] = mapped_column(primary_key=True)
-    timestamp: Mapped[datetime]
+    timestamp: Mapped[datetime] = mapped_column(default=partial(datetime.now, tz=UTC))
     player: Mapped[str] = mapped_column(String(255))
+    room: Mapped[str] = mapped_column(String(255))
     offending_content: Mapped[str] = mapped_column(UnicodeText)
-    deleted: Mapped[bool]
+    detected_languages: Mapped[list[str]] = mapped_column(JSON)
+    matched_terms: Mapped[list[str]] = mapped_column(JSON)
 
 
 class JIDNickWhitelist(Base):
diff --git a/xpartamupp/modbot.py b/xpartamupp/modbot.py