Markov: a module to implement a chatterbot via Markov chains.

Yeah, we have MegaHAL, but I can't find a good implementation in Python that actually works and is stable, so we'll implement a simple thing ourselves. It works pretty much like MegaHAL does, but without the string corruption. Original code provided by Ape, care of Mike Bloy.
2011-01-18 22:30:59 -06:00 · 2011-01-18 22:30:59 -06:00 · 8dd223f778
commit 8dd223f778
parent 8f86b7484a
1 changed files with 229 additions and 0 deletions
--- a/modules/Markov.py
+++ b/modules/Markov.py
@ -0,0 +1,229 @@
+"""
+Markov - Chatterbot via Markov chains for IRC
+Copyright (C) 2010  Brian S. Stephan
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import cPickle
+import os
+import random
+import re
+import sys
+
+from extlib import irclib
+
+from Module import Module
+
+class Markov(Module):
+
+    """
+    Create a chatterbot very similar to a MegaHAL, but simpler and
+    implemented in pure Python. Proof of concept code from Ape.
+
+    The brain is a dict that maps a pair of consecutive words (w1, w2)
+    to the list of words that have been seen following that pair. Every
+    learned chain begins at the (start1, start2) sentinel pair and is
+    capped with the stop sentinel. All learned words are lowercased.
+
+    Ape wrote: based on this:
+    http://uswaretech.com/blog/2009/06/pseudo-random-text-markov-chains-python/
+    and this:
+    http://code.activestate.com/recipes/194364-the-markov-chain-algorithm/
+    """
+
+    def __init__(self, irc, config, server):
+        """Create the Markov chainer, and load a pickled brain if available."""
+
+        Module.__init__(self, irc, config, server)
+
+        self.brain_filename = 'dr.botzo.markov'
+
+        # set up some keywords for use in the chains --- don't change these
+        # once you've created a brain
+        self.start1 = '__start1'
+        self.start2 = '__start2'
+        self.stop = '__stop'
+
+        # set up regexes, for replying to specific stuff (raw strings, so
+        # the \s escapes reach the regex engine untouched)
+        self.trainre = re.compile(r'!markov\s+train\s+(.*)$')
+        self.learnre = re.compile(r'!markov\s+learn\s+(.*)$')
+        self.replyre = re.compile(r'!markov\s+reply(\s+(.*)$|$)')
+
+        try:
+            # NOTE: unpickling can execute arbitrary code, so the brain file
+            # must only ever come from a trusted source
+            with open(self.brain_filename, 'r') as brainfile:
+                self.brain = cPickle.load(brainfile)
+        except IOError:
+            # no readable brain yet; start with one that only knows how to
+            # end a chain, so that replies can always be generated
+            self.brain = {}
+            self.brain.setdefault((self.start1, self.start2), []).append(self.stop)
+
+    def register_handlers(self, server):
+        """Handle pubmsg/privmsg, to learn and/or reply to IRC events."""
+
+        self.server.add_global_handler('pubmsg', self.learn_from_irc_event)
+        self.server.add_global_handler('privmsg', self.learn_from_irc_event)
+
+    def unregister_handlers(self):
+        """Remove the pubmsg/privmsg handlers added by register_handlers."""
+
+        self.server.remove_global_handler('pubmsg', self.learn_from_irc_event)
+        self.server.remove_global_handler('privmsg', self.learn_from_irc_event)
+
+    def save(self):
+        """Pickle the brain upon save."""
+
+        # the with block closes the file even if pickling fails part way
+        with open(self.brain_filename, 'w') as brainfile:
+            cPickle.dump(self.brain, brainfile)
+
+    def learn_from_irc_event(self, connection, event):
+        """Learn from IRC events."""
+
+        what = ''.join(event.arguments()[0])
+
+        # don't learn from commands
+        if self.trainre.search(what) or self.learnre.search(what) or self.replyre.search(what):
+            return
+
+        self._learn_line(what)
+
+    def do(self, connection, event, nick, userhost, what, admin_unlocked):
+        """
+        Handle commands and inputs.
+
+        Dispatches the !markov train/learn/reply commands. Otherwise, if
+        the bot's nickname appears in the text, returns a generated reply
+        (prefixed with the speaker's nick when the bot was addressed
+        directly). Returns None when there is nothing to say.
+        """
+
+        if self.trainre.search(what):
+            return self.markov_train(connection, event, nick, userhost, what, admin_unlocked)
+        elif self.learnre.search(what):
+            return self.markov_learn(connection, event, nick, userhost, what, admin_unlocked)
+        elif self.replyre.search(what):
+            return self.markov_reply(connection, event, nick, userhost, what, admin_unlocked)
+
+        # not a command, so see if i'm being mentioned. the nickname is
+        # escaped because IRC nicks may contain regex metacharacters
+        # such as [ ] \ ^ | { }
+        mynick = re.escape(connection.get_nickname())
+        if re.search(mynick, what, re.IGNORECASE) is None:
+            return None
+
+        addressed = re.match('^' + mynick + r'[:,]\s+(.*)', what)
+        if addressed:
+            # i was addressed directly, so respond, addressing the speaker
+            return '{0:s}: {1:s}'.format(nick, self._reply_to_line(addressed.group(1)))
+
+        # i wasn't addressed directly, so just respond
+        return '{0:s}'.format(self._reply_to_line(what))
+
+    def markov_train(self, connection, event, nick, userhost, what, admin_unlocked):
+        """
+        Learn lines from a file. Good for initializing a brain.
+
+        Only acts when admin_unlocked is true; returns a status string,
+        or None if the command did not match or was not permitted.
+        """
+
+        match = self.trainre.search(what)
+        if match and admin_unlocked:
+            filename = match.group(1)
+
+            try:
+                # the with block closes the file even if reading fails
+                with open(filename, 'r') as trainfile:
+                    for line in trainfile:
+                        self._learn_line(line)
+            except IOError:
+                return 'No such file \'{0:s}\'.'.format(filename)
+
+            return 'Learned from \'{0:s}\'.'.format(filename)
+
+    def markov_learn(self, connection, event, nick, userhost, what, admin_unlocked):
+        """Learn one line, as provided to the command."""
+
+        match = self.learnre.search(what)
+        if match:
+            line = match.group(1)
+            self._learn_line(line)
+
+    def markov_reply(self, connection, event, nick, userhost, what, admin_unlocked):
+        """Generate a reply to one line, without learning it."""
+
+        match = self.replyre.search(what)
+        if match:
+            if match.group(2):
+                line = match.group(2)
+                return self._reply_to_line(line)
+            else:
+                return self._reply()
+
+    def _learn_line(self, line):
+        """Create Markov chains from the provided line."""
+
+        words = [word.lower() for word in line.split()]
+
+        # a blank line teaches nothing, and recording it would only make
+        # empty replies more likely (start pair followed directly by stop)
+        if not words:
+            return
+
+        # set up the head of the chain
+        w1 = self.start1
+        w2 = self.start2
+
+        # for each word pair, add the next word to the dictionary
+        for word in words:
+            self.brain.setdefault((w1, w2), []).append(word)
+            w1, w2 = w2, word
+
+        # cap the end of the chain
+        self.brain.setdefault((w1, w2), []).append(self.stop)
+
+    def _reply(self, size=25):
+        """Generate a totally random string from the chains, of specified limit of words."""
+
+        return self._walk_chain(size, [])
+
+    def _reply_to_line(self, line, size=25):
+        """
+        Reply to a line, using some text in the line as a point in the chain.
+
+        A line with no words falls back to a totally random reply.
+        """
+
+        # the brain only holds lowercased words, so lowercase the
+        # candidates as well, or capitalized input could never match
+        return self._walk_chain(size, [word.lower() for word in line.split()])
+
+    def _walk_chain(self, size, candidates):
+        """
+        Walk the chains at random and return the generated text.
+
+        size is the maximum number of words to generate, and must be
+        greater than 3 (ValueError otherwise). candidates is a list of
+        lowercased words to steer towards: whenever the current target
+        candidate is a known follower of the current pair it is used,
+        and a new target is drawn. An empty list means a purely random
+        walk.
+        """
+
+        # if the limit is too low, there's nothing to do
+        if size <= 3:
+            raise ValueError("size is too small: %d" % size)
+
+        # get a random word from the input, if there is any input
+        target_word = random.choice(candidates) if candidates else None
+
+        # start with an empty chain, and work from there
+        gen_words = [self.start1, self.start2]
+
+        # walk a chain, randomly, building the list of words
+        while len(gen_words) < size + 2 and gen_words[-1] != self.stop:
+            # an unknown pair (e.g. a foreign brain file) just ends the chain
+            followers = self.brain.get((gen_words[-2], gen_words[-1])) or [self.stop]
+
+            if target_word is not None and target_word in followers:
+                # use the chain that includes the target word, then pick
+                # a new target for the rest of the walk
+                gen_words.append(target_word)
+                target_word = random.choice(candidates)
+            else:
+                gen_words.append(random.choice(followers))
+
+        # chop off the seed data at the start
+        gen_words = gen_words[2:]
+
+        # chop off the end text, if it was the keyword indicating an end of chain
+        if gen_words[-1] == self.stop:
+            gen_words = gen_words[:-1]
+
+        return ' '.join(gen_words)
+
+# vi:tabstop=4:expandtab:autoindent
+# kate: indent-mode python;indent-width 4;replace-tabs on;