update old textgen (now textgen_markov)

This commit is contained in:
2022-11-08 20:45:41 +01:00
parent 37dec1dd0b
commit 9e44532253

82
textgen_markov.py Normal file
View File

@ -0,0 +1,82 @@
#!/usr/bin/env python3
import re
import random
from textgen import textgen
from rich.traceback import install
install()
# NOTE: This is word based, not character based
# TODO: Serialize and save/load model (don't train on the server)
# TODO: Maybe extract sentence beginnings and use them as starters?
class MarkovTextGenerator(textgen):
# The greater the order (prefix length), the lesser the variation in generation, but the better the sentences (generally).
# If the prefix length is high there are less options to choose from, so the sentences are very close to the training text.
def __init__(self, order): # Set order here for better interface (only needed for markov model)
self.order = order
def init(self, filename): # Filename is needed for every type of model so it's part of the interface
with open(f"./textfiles/{filename}.txt") as file:
# Remove all characters except a-zäöüß'.,
self.wordbase = re.sub(r"[^a-zäöüß'.,]+", " ", file.read().lower()).split()
self.word_table = dict()
def load(self):
print(f"Loaded Markov chain of order {self.order} with {len(self.wordbase)} words from file.")
def train(self):
print(f"Training Markov chain of order {self.order} with {len(self.wordbase)} words.")
# init the frequencies
for i in range(len(self.wordbase) - self.order - 1): # Look at every word in range
prefix = tuple(self.wordbase[i:i+self.order]) # Look at the next self.order words from current position
suffix = self.wordbase[i+self.order] # The next word is the suffix
if prefix not in self.word_table: # New option wooo
self.word_table[prefix] = []
# if suffix not in self.table[prefix]: # disable for probabilities: if the suffixes are in the list multiple times they are more common
self.word_table[prefix].append(suffix)
print(f"Generated suffixes for {len(self.word_table)} prefixes.")
# def generate_random(self, n):
# fword = random.choice(list(self.word_table.keys())) # Random first word
# output = [*fword]
# for _ in range(self.order, n):
# output.append(self.generate_word_by_word(tuple(output[-self.order :])))
# return output
def generate_suffix_for_prefix(self, prefix: tuple):
if len(prefix) > self.order: # In this case we look at the last self.order elements of prefix
prefix = prefix[len(prefix)-self.order-1:-1]
if prefix not in self.word_table: # In this case we need to choose a possible suffix from the last word in the prefix (if prefix too short for example)
print(f"Prefix {prefix} not in table")
for key in self.word_table.keys():
if key[-1] == prefix[-1]:
return random.choice(self.word_table[key])
return random.choice(self.word_table[prefix])
def generate_sentence(self):
fword = random.choice(list(self.word_table.keys()))
output = [*fword]
while "." not in output[-1]:
output.append(self.generate_suffix_for_prefix(tuple(output[-self.order:])))
return output
def complete_sentence(self, prefix):
output = [*prefix]
while "." not in output[-1]:
output.append(self.generate_suffix_for_prefix(tuple(output[-self.order:])))
return output