Here is a fun little class I threw together this week while finishing up with Think Python.
It scans text files from Project Gutenberg to build a dictionary that maps each word (and each pair of consecutive words) in the book to the words that immediately follow it, along with how often each one occurs. It then uses this dictionary to generate a random string of text starting from any initial word you give it. I apologize for the horrible lack of commenting.
import string
import random
import os
class TextGenerator(object):
    """Markov-chain text generator trained on Project Gutenberg plaintext.

    ``self.markov`` maps a context to a frequency table of the words seen
    immediately after it: ``{context: {next_word: count}}``.  A context is
    either a single word (``str``) or a ``(word1, word2)`` tuple of two
    consecutive words.
    """

    def __init__(self):
        self.markov = dict()

    def __str__(self):
        return str(self.markov)

    def add(self, file):
        """Read the book at path ``file`` and merge its words into the model."""
        self.markov_analysis(self.process_file(file))

    def generate_text(self, first_word='the', num=10):
        """Return ``first_word`` followed by up to ``num`` generated words.

        Each new word is drawn from the followers of the last two words
        when that pair has been seen, otherwise from the followers of the
        last word alone.  Generation stops early if the current word has no
        recorded follower (including an unknown ``first_word``), so the
        result may contain fewer than ``num`` generated words.

        Words are joined with single spaces.
        """
        words = [first_word]
        prev = None
        word = first_word
        count = 0
        while count < num:
            followers = None
            if prev is not None:
                followers = self.markov.get((prev, word))
            if followers is None:
                followers = self.markov.get(word)
            if not followers:
                # Dead end: nothing ever followed this context.
                break
            # Shift the window: the current word becomes the previous one
            # *before* it is replaced, so the next pair lookup uses two
            # distinct consecutive words.
            prev, word = word, self.random_word(followers)
            words.append(word)
            count += 1
        return ' '.join(words)

    def process_file(self, file):
        """Return the normalised words of a Project Gutenberg plaintext file.

        Everything up to and including the first line that contains
        ``'***'`` exactly twice is treated as the Gutenberg header and
        skipped.  Remaining lines are split on whitespace; each token is
        lower-cased and stripped of ASCII punctuation and whitespace
        characters.  Tokens that end up empty (e.g. ``--``) are dropped.
        """
        bad_chars = string.punctuation + string.whitespace
        result = []
        gutenberg_header = True
        with open(file) as fin:
            for line in fin:
                if gutenberg_header:
                    # The marker line itself is not part of the book text.
                    if line.count('***') == 2:
                        gutenberg_header = False
                    continue
                for token in line.split():
                    # Character filter instead of str.translate so the same
                    # code runs on Python 2 and Python 3.
                    word = ''.join(
                        ch for ch in token.lower() if ch not in bad_chars)
                    if word:
                        result.append(word)
        return result

    def markov_analysis(self, word_list):
        """Record follower counts for every word and word pair in order.

        For each word after the first, it is counted as a follower of the
        preceding word; from the third word on it is also counted as a
        follower of the preceding two-word tuple.
        """
        prev = None
        second_prev = None
        for word in word_list:
            if prev is not None:
                if second_prev is not None:
                    self.add_markov((second_prev, prev), word)
                self.add_markov(prev, word)
            second_prev = prev
            prev = word

    def add_markov(self, key, value):
        """Increment the count of ``value`` as a follower of ``key``."""
        if key not in self.markov:
            self.markov[key] = dict()
        self.markov[key][value] = self.markov[key].get(value, 0) + 1

    def random_word(self, h):
        """Pick a key of ``h`` at random, weighted by its integer count."""
        t = []
        for word, freq in h.items():
            t.extend([word] * freq)
        return random.choice(t)

    def add_directory(self, directory):
        """Add every regular file directly inside ``directory`` to the model.

        Sub-directories are not descended into.  Prints one
        ``Scanning: <path>`` line per file processed.
        """
        for name in os.listdir(directory):
            # os.path.join works whether or not ``directory`` ends with a
            # path separator.
            path = os.path.join(directory, name)
            if os.path.isfile(path):
                print('Scanning: ' + path)
                self.add(path)
def main():
    """Train a generator on the files in ``books/`` and print sample text."""
    generator = TextGenerator()
    generator.add_directory('books/')
    # Parenthesised form is valid on Python 3 and prints the same single
    # string on Python 2.
    print(generator.generate_text('the'))
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Tags: python