update old textgen (now textgen_markov)
textgen_markov.py (new file, 82 lines)
@@ -0,0 +1,82 @@
#!/usr/bin/env python3

import re
import random
from textgen import textgen

from rich.traceback import install
install()

# NOTE: This is word based, not character based
# TODO: Serialize and save/load model (don't train on the server)
# TODO: Maybe extract sentence beginnings and use them as starters?

class MarkovTextGenerator(textgen):
    # The higher the order (prefix length), the less variation in the generated text, but generally the better the sentences:
    # with a long prefix there are fewer suffixes to choose from, so the output stays very close to the training text.
    def __init__(self, order):  # The order is set here for a cleaner interface (it is only needed for the Markov model)
        self.order = order

    def init(self, filename):  # A filename is needed for every type of model, so it is part of the interface
        with open(f"./textfiles/{filename}.txt") as file:
            # Replace every run of characters other than a-zäöüß'., with a space and split into lowercase words
            self.wordbase = re.sub(r"[^a-zäöüß'.,]+", " ", file.read().lower()).split()

        self.word_table = dict()

    def load(self):
        print(f"Loaded Markov chain of order {self.order} with {len(self.wordbase)} words from file.")

    def train(self):
        print(f"Training Markov chain of order {self.order} with {len(self.wordbase)} words.")

        # Initialise the suffix frequencies
        for i in range(len(self.wordbase) - self.order):  # Every position that still has a full prefix plus one following word
            prefix = tuple(self.wordbase[i:i+self.order])  # The next self.order words from the current position
            suffix = self.wordbase[i+self.order]  # The word right after the prefix is the suffix

            if prefix not in self.word_table:  # New prefix: start an empty suffix list
                self.word_table[prefix] = []

            # if suffix not in self.word_table[prefix]:  # Deliberately disabled: duplicate suffixes make common continuations more likely
            self.word_table[prefix].append(suffix)

        print(f"Generated suffixes for {len(self.word_table)} prefixes.")

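    # Illustration (hypothetical data, not from this file): with order 2 and the training text
    # "the old man saw the old dog .", word_table[("the", "old")] ends up as ["man", "dog"],
    # so both continuations are equally likely; a suffix that occurred twice would be twice as likely.
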
    # def generate_random(self, n):
    #     fword = random.choice(list(self.word_table.keys()))  # Random first word
    #     output = [*fword]

    #     for _ in range(self.order, n):
    #         output.append(self.generate_word_by_word(tuple(output[-self.order:])))

    #     return output

    def generate_suffix_for_prefix(self, prefix: tuple):
        if len(prefix) > self.order:  # Only the last self.order words of the prefix are used
            prefix = prefix[-self.order:]

        if prefix not in self.word_table:  # Unknown prefix (e.g. the prefix is too short): fall back to any prefix that ends in the same word
            print(f"Prefix {prefix} not in table")
            for key in self.word_table.keys():
                if key[-1] == prefix[-1]:
                    return random.choice(self.word_table[key])
            # Last resort: no prefix ends in the same word, so continue from a random prefix
            return random.choice(random.choice(list(self.word_table.values())))

        return random.choice(self.word_table[prefix])

    def generate_sentence(self):
        fword = random.choice(list(self.word_table.keys()))  # Random starting prefix
        output = [*fword]

        while "." not in output[-1]:  # Keep appending suffixes until a word closes the sentence with a period
            output.append(self.generate_suffix_for_prefix(tuple(output[-self.order:])))

        return output

    def complete_sentence(self, prefix):
        output = [*prefix]

        while "." not in output[-1]:
            output.append(self.generate_suffix_for_prefix(tuple(output[-self.order:])))

        return output
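
A minimal usage sketch (not part of the commit): it assumes the textgen base class needs no further setup and that a training file such as ./textfiles/example.txt exists; the filename and the example prefix are made up.

gen = MarkovTextGenerator(order=2)      # order 2: two-word prefixes
gen.init("example")                     # reads ./textfiles/example.txt (hypothetical file)
gen.train()                             # builds the prefix -> suffix table
print(" ".join(gen.generate_sentence()))
print(" ".join(gen.complete_sentence(("the", "old"))))  # made-up starting prefix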
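
For the serialize/save/load TODO, one option would be to pickle the trained table so the server only loads it instead of training. A rough sketch under that assumption; the save_model/load_model names and the ./models/ path are made up, not part of this commit.

import pickle

# Methods that could be added to MarkovTextGenerator:
def save_model(self, filename):
    with open(f"./models/{filename}.pkl", "wb") as file:  # assumed ./models/ directory
        pickle.dump((self.order, self.word_table), file)

def load_model(self, filename):
    with open(f"./models/{filename}.pkl", "rb") as file:
        self.order, self.word_table = pickle.load(file)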