#!/usr/bin/env python3
# Origin: commit 9e44532 by ChUrl (2022-11-08), "update old textgen (now textgen_markov)".

import re
import random
from textgen import textgen

from rich.traceback import install
install()

# NOTE: This is word based, not character based
# TODO: Serialize and save/load model (don't train on the server)
# TODO: Maybe extract sentence beginnings and use them as starters?


class MarkovTextGenerator(textgen):
    """Word-based Markov chain text generator.

    The greater the order (prefix length), the less variation there is in the
    generated text, but the better the sentences are (generally). With a long
    prefix there are fewer suffixes to choose from, so the output stays very
    close to the training text.

    Typical usage: construct with an order, call ``init(filename)`` to read
    the word base, ``train()`` to build the prefix table, then
    ``generate_sentence()`` or ``complete_sentence(prefix)``.
    """

    def __init__(self, order):
        """Store the chain order (number of words that form a prefix).

        The order is set here because only the Markov model needs it.
        """
        self.order = order

    def init(self, filename):
        """Read ``./textfiles/<filename>.txt`` into the word base.

        The text is lower-cased, every run of characters other than
        ``a-z``, ``äöüß``, ``'``, ``.`` and ``,`` is replaced by a space, and
        the result is split on whitespace. The prefix table is reset to empty.

        The filename is needed for every type of model, so it is part of the
        shared interface.
        """
        # Decode explicitly: the character whitelist below contains umlauts,
        # which only match if the file is decoded consistently on every OS.
        with open(f"./textfiles/{filename}.txt", encoding="utf-8") as file:
            # Remove all characters except a-zäöüß'.,
            self.wordbase = re.sub(r"[^a-zäöüß'.,]+", " ", file.read().lower()).split()

        self.word_table = {}

    def load(self):
        """Report the order and the size of the current word base."""
        print(f"Loaded Markov chain of order {self.order} with {len(self.wordbase)} words from file.")

    def train(self):
        """Fill the prefix table from the word base.

        Every window of ``self.order`` consecutive words becomes a prefix
        (stored as a tuple) and the word that follows it is appended to that
        prefix's suffix list. Suffixes are deliberately kept with duplicates,
        so picking uniformly from the list reproduces their frequencies.
        """
        print(f"Training Markov chain of order {self.order} with {len(self.wordbase)} words.")

        # The last valid window starts at len - order - 1, so the exclusive
        # upper bound is len - order (the range is empty for short inputs).
        for i in range(len(self.wordbase) - self.order):
            prefix = tuple(self.wordbase[i:i + self.order])
            suffix = self.wordbase[i + self.order]
            self.word_table.setdefault(prefix, []).append(suffix)

        print(f"Generated suffixes for {len(self.word_table)} prefixes.")

    # def generate_random(self, n):
    #     fword = random.choice(list(self.word_table.keys())) # Random first word
    #     output = [*fword]

    #     for _ in range(self.order, n):
    #         output.append(self.generate_word_by_word(tuple(output[-self.order :])))

    #     return output

    def generate_suffix_for_prefix(self, prefix: tuple):
        """Return a randomly chosen next word for ``prefix``.

        Only the last ``self.order`` words of ``prefix`` are used. If that
        prefix is unknown (for example because it is too short), a suffix is
        chosen among all known prefixes that end with the same last word; if
        there is none either, a suffix of a random known prefix is returned so
        that generation can continue.

        Raises:
            KeyError: if the prefix table is empty (nothing was trained).
        """
        prefix = tuple(prefix)

        if len(prefix) > self.order:
            # Keep exactly the last self.order words (also correct for order 0).
            prefix = prefix[len(prefix) - self.order:]

        if prefix in self.word_table:
            return random.choice(self.word_table[prefix])

        print(f"Prefix {prefix} not in table")

        if not self.word_table:
            raise KeyError("word table is empty; call train() first")

        if prefix:
            last_word = prefix[-1]
            # Pool the suffixes of every prefix ending with the same word so
            # the choice is not biased towards the first matching key.
            candidates = [
                suffix
                for key, suffixes in self.word_table.items()
                if key and key[-1] == last_word
                for suffix in suffixes
            ]
            if candidates:
                return random.choice(candidates)

        # Nothing related is known: continue from a random known prefix.
        return random.choice(self.word_table[random.choice(list(self.word_table))])

    def _finish_sentence(self, output, max_words):
        """Append generated words to ``output`` until one contains a period.

        Stops early once ``output`` holds ``max_words`` words, which prevents
        an endless loop when no word containing "." can be reached.
        """
        while (not output or "." not in output[-1]) and len(output) < max_words:
            output.append(self.generate_suffix_for_prefix(tuple(output[-self.order:])))

        return output

    def generate_sentence(self, max_words=1000):
        """Generate a sentence starting from a random known prefix.

        Args:
            max_words: upper bound on the number of words returned.

        Returns:
            The list of words, normally ending with a word that contains ".".

        Raises:
            IndexError: if the prefix table is empty (nothing was trained).
        """
        if not self.word_table:
            raise IndexError("word table is empty; call train() first")

        fword = random.choice(list(self.word_table.keys()))
        return self._finish_sentence([*fword], max_words)

    def complete_sentence(self, prefix, max_words=1000):
        """Extend the given words until the sentence ends with a period.

        Args:
            prefix: iterable of starting words; an empty prefix behaves like
                ``generate_sentence``.
            max_words: upper bound on the total number of words returned. A
                prefix that is already this long is returned unchanged.

        Returns:
            The list of words, starting with ``prefix``.
        """
        output = [*prefix]

        if not output:
            return self.generate_sentence(max_words)

        return self._finish_sentence(output, max_words)