From 1712a7db53ad6dc6b9f405b3a7a7160a9e6f22fe Mon Sep 17 00:00:00 2001 From: "Brian S. Stephan" Date: Thu, 24 Feb 2011 20:39:32 -0600 Subject: [PATCH] Markov: use sqlite backend for brain this keeps us from having the entire markov chain in memory and having to do the pickling and so on. in many ways, this is a good thing. in one way, this is a bad thing: each line learned from irc creates a (__start1, __start2) row in the database, so the select that starts a chain has to fetch one row per learned line, which makes starting a chain an expensive process (approx 3 seconds, from irc logs of 600,000 K lines). the following selects run much faster, but the first one is very slow. a later commit should hopefully fix this. --- modules/Markov.py | 104 +++++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 33 deletions(-) diff --git a/modules/Markov.py b/modules/Markov.py index 8f5800a..5ab03ac 100644 --- a/modules/Markov.py +++ b/modules/Markov.py @@ -20,6 +20,7 @@ import cPickle import os import random import re +import sqlite3 import sys from extlib import irclib @@ -41,10 +42,6 @@ class Markov(Module): def __init__(self, irc, config, server): """Create the Markov chainer, and learn text from a file if available.""" - Module.__init__(self, irc, config, server) - - self.brain_filename = 'dr.botzo.markov' - # set up some keywords for use in the chains --- don't change these # once you've created a brain self.start1 = '__start1' @@ -60,13 +57,33 @@ class Markov(Module): self.learnre = re.compile(learnpattern) self.replyre = re.compile(replypattern) - try: - brainfile = open(self.brain_filename, 'r') - self.brain = cPickle.load(brainfile) - brainfile.close() - except IOError: - self.brain = {} - self.brain.setdefault((self.start1, self.start2), []).append(self.stop) + Module.__init__(self, irc, config, server) + + def db_init(self): + """Create the markov chain table.""" + + version = self.db_module_registered(self.__class__.__name__) + if (version == None): + db = self.get_db() + try: + db.execute(''' + CREATE TABLE 
markov_chain ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + k1 TEXT NOT NULL, + k2 TEXT NOT NULL, + v TEXT NOT NULL + )''') + db.execute('CREATE INDEX markov_chain_key_index ON markov_chain (k1, k2)') + sql = 'INSERT INTO drbotzo_modules VALUES (?,?)' + db.execute(sql, (self.__class__.__name__, 1)) + db.commit() + version = 1 + + self._learn_line('') + except sqlite3.Error as e: + db.rollback() + print("sqlite error: " + str(e)) + raise def register_handlers(self): """Handle pubmsg/privmsg, to learn and/or reply to IRC events.""" @@ -82,13 +99,6 @@ class Markov(Module): self.server.remove_global_handler('pubmsg', self.learn_from_irc_event) self.server.remove_global_handler('privmsg', self.learn_from_irc_event) - def save(self): - """Pickle the brain upon save.""" - - brainfile = open(self.brain_filename, 'w') - cPickle.dump(self.brain, brainfile) - brainfile.close() - def learn_from_irc_event(self, connection, event): """Learn from IRC events.""" @@ -170,16 +180,24 @@ class Markov(Module): """Create Markov chains from the provided line.""" # set up the head of the chain - w1 = self.start1 - w2 = self.start2 + k1 = self.start1 + k2 = self.start2 - # for each word pair, add the next word to the dictionary - for word in line.split(): - self.brain.setdefault((w1, w2), []).append(word.lower()) - w1, w2 = w2, word.lower() + try: + db = self.get_db() + cur = db.cursor() + statement = 'INSERT INTO markov_chain (k1, k2, v) VALUES (?, ?, ?)' - # cap the end of the chain - self.brain.setdefault((w1, w2), []).append(self.stop) + for word in line.split(): + cur.execute(statement, (k1.decode('utf-8', 'replace').lower(), k2.decode('utf-8', 'replace').lower(), word.decode('utf-8', 'replace').lower())) + k1, k2 = k2, word + cur.execute(statement, (k1.decode('utf-8', 'replace').lower(), k2.decode('utf-8', 'replace').lower(), self.stop)) + + db.commit() + except sqlite3.Error as e: + db.rollback() + print("sqlite error: " + str(e)) + raise def _reply(self, min_size=15, max_size=100): 
"""Generate a totally random string from the chains, of specified limit of words.""" @@ -200,10 +218,11 @@ class Markov(Module): # walk a chain, randomly, building the list of words while len(gen_words) < max_size + 2 and gen_words[-1] != self.stop: - if len(gen_words) < min_size and len(filter(lambda a: a != self.stop, self.brain[(gen_words[-2], gen_words[-1])])) > 0: + key_hits = self._retrieve_chains_for_key(gen_words[-2], gen_words[-1]) + if len(gen_words) < min_size and len(filter(lambda a: a != self.stop, key_hits)) > 0: # we aren't at min size yet and we have at least one chain path # that isn't (yet) the end. take one of those. - gen_words.append(random.choice(filter(lambda a: a != self.stop, self.brain[(gen_words[-2], gen_words[-1])]))) + gen_words.append(random.choice(filter(lambda a: a != self.stop, key_hits))) min_search_tries = 0 elif len(gen_words) < min_size and min_search_tries <= 10: # we aren't at min size yet and the only path we currently have is @@ -215,7 +234,7 @@ class Markov(Module): # either we have hit our min size requirement, or we haven't but # we also exhausted min_search_tries. 
either way, just pick a word # at random, knowing it may be the end of the chain - gen_words.append(random.choice(self.brain[(gen_words[-2], gen_words[-1])])) + gen_words.append(random.choice(key_hits)) min_search_tries = 0 # chop off the seed data at the start @@ -247,16 +266,17 @@ class Markov(Module): # walk a chain, randomly, building the list of words while len(gen_words) < max_size + 2 and gen_words[-1] != self.stop: + key_hits = self._retrieve_chains_for_key(gen_words[-2], gen_words[-1]) # use the chain that includes the target word, if it is found - if target_word in self.brain[(gen_words[-2], gen_words[-1])]: + if target_word in key_hits: gen_words.append(target_word) # generate new word target_word = words[random.randint(0, len(words)-1)] else: - if len(gen_words) < min_size and len(filter(lambda a: a != self.stop, self.brain[(gen_words[-2], gen_words[-1])])) > 0: - gen_words.append(random.choice(filter(lambda a: a != self.stop, self.brain[(gen_words[-2], gen_words[-1])]))) + if len(gen_words) < min_size and len(filter(lambda a: a != self.stop, key_hits)) > 0: + gen_words.append(random.choice(filter(lambda a: a != self.stop, key_hits))) else: - gen_words.append(random.choice(self.brain[(gen_words[-2], gen_words[-1])])) + gen_words.append(random.choice(key_hits)) # chop off the seed data at the start gen_words = gen_words[2:] @@ -267,5 +287,23 @@ class Markov(Module): return ' '.join(gen_words) + def _retrieve_chains_for_key(self, k1, k2): + """Get the value(s) for a given key (a pair of strings).""" + + values = [] + try: + db = self.get_db() + query = 'SELECT v FROM markov_chain WHERE k1 = ? AND k2 = ?' + cursor = db.execute(query, (k1,k2)) + results = cursor.fetchall() + + for result in results: + values.append(result['v']) + + return values + except sqlite3.Error as e: + print('sqlite error: ' + str(e)) + raise + # vi:tabstop=4:expandtab:autoindent # kate: indent-mode python;indent-width 4;replace-tabs on;