update old textgen (now textgen_markov)
textgen_markov.py (new file, 82 lines)
@@ -0,0 +1,82 @@
#!/usr/bin/env python3

import re
import random
from textgen import textgen

from rich.traceback import install
install()

# NOTE: This is word based, not character based
# TODO: Serialize and save/load model (don't train on the server)
# TODO: Maybe extract sentence beginnings and use them as starters?

class MarkovTextGenerator(textgen):
    # The higher the order (prefix length), the less variation in the generated text, but generally the better the sentences:
    # with a long prefix there are fewer suffixes to choose from, so the output stays very close to the training text.
    def __init__(self, order):  # The order is set here for a cleaner interface (it is only needed for the Markov model)
        self.order = order

    def init(self, filename):  # A filename is needed for every type of model, so it is part of the interface
        with open(f"./textfiles/{filename}.txt") as file:
            # Replace every run of characters other than a-zäöüß'., with a space and split into lowercase words
            self.wordbase = re.sub(r"[^a-zäöüß'.,]+", " ", file.read().lower()).split()

        self.word_table = dict()

    def load(self):
        print(f"Loaded Markov chain of order {self.order} with {len(self.wordbase)} words from file.")

    def train(self):
        print(f"Training Markov chain of order {self.order} with {len(self.wordbase)} words.")

        # Initialise the suffix frequencies
        for i in range(len(self.wordbase) - self.order):  # Every position that still has a full prefix plus one following word
            prefix = tuple(self.wordbase[i:i+self.order])  # The next self.order words from the current position
            suffix = self.wordbase[i+self.order]  # The word right after the prefix is the suffix

            if prefix not in self.word_table:  # New prefix: start an empty suffix list
                self.word_table[prefix] = []

            # if suffix not in self.word_table[prefix]:  # Deliberately disabled: duplicate suffixes make common continuations more likely
            self.word_table[prefix].append(suffix)

        print(f"Generated suffixes for {len(self.word_table)} prefixes.")

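    # Illustration (hypothetical data, not from this file): with order 2 and the training text
    # "the old man saw the old dog .", word_table[("the", "old")] ends up as ["man", "dog"],
    # so both continuations are equally likely; a suffix that occurred twice would be twice as likely.
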
    # def generate_random(self, n):
    #     fword = random.choice(list(self.word_table.keys()))  # Random first word
    #     output = [*fword]

    #     for _ in range(self.order, n):
    #         output.append(self.generate_word_by_word(tuple(output[-self.order:])))

    #     return output

    def generate_suffix_for_prefix(self, prefix: tuple):
        if len(prefix) > self.order:  # Only the last self.order words of the prefix are used
            prefix = prefix[-self.order:]

        if prefix not in self.word_table:  # Unknown prefix (e.g. the prefix is too short): fall back to any prefix that ends in the same word
            print(f"Prefix {prefix} not in table")
            for key in self.word_table.keys():
                if key[-1] == prefix[-1]:
                    return random.choice(self.word_table[key])
            # Last resort: no prefix ends in the same word, so continue from a random prefix
            return random.choice(random.choice(list(self.word_table.values())))

        return random.choice(self.word_table[prefix])

    def generate_sentence(self):
        fword = random.choice(list(self.word_table.keys()))  # Random starting prefix
        output = [*fword]

        while "." not in output[-1]:  # Keep appending suffixes until a word closes the sentence with a period
            output.append(self.generate_suffix_for_prefix(tuple(output[-self.order:])))

        return output

    def complete_sentence(self, prefix):
        output = [*prefix]

        while "." not in output[-1]:
            output.append(self.generate_suffix_for_prefix(tuple(output[-self.order:])))

        return output
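
A minimal usage sketch (not part of the commit): it assumes the textgen base class needs no further setup and that a training file such as ./textfiles/example.txt exists; the filename and the example prefix are made up.

gen = MarkovTextGenerator(order=2)      # order 2: two-word prefixes
gen.init("example")                     # reads ./textfiles/example.txt (hypothetical file)
gen.train()                             # builds the prefix -> suffix table
print(" ".join(gen.generate_sentence()))
print(" ".join(gen.complete_sentence(("the", "old"))))  # made-up starting prefix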
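
For the serialize/save/load TODO, one option would be to pickle the trained table so the server only loads it instead of training. A rough sketch under that assumption; the save_model/load_model names and the ./models/ path are made up, not part of this commit.

import pickle

# Methods that could be added to MarkovTextGenerator:
def save_model(self, filename):
    with open(f"./models/{filename}.pkl", "wb") as file:  # assumed ./models/ directory
        pickle.dump((self.order, self.word_table), file)

def load_model(self, filename):
    with open(f"./models/{filename}.pkl", "rb") as file:
        self.order, self.word_table = pickle.load(file)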