From 9732ed2f34430c4151466b174e92ad6af77efa4d Mon Sep 17 00:00:00 2001 From: "Brian S. Stephan" Date: Thu, 6 Jan 2011 00:14:16 -0600 Subject: [PATCH] use a different megahal implementation, which seems to be better at saving the brain. note that nothing megahal is shipped with the bot now, including train files. this bot will expect a ./megahal.trn, and will write to ./megahal.brn (in addition to a slew of other ./megahal.* files). implementation is from http://megahal.alioth.debian.org/ --- .gitignore | 1 + extlib/megahal.py | 472 --------------------------------------------- modules/MegaHAL.py | 19 +- 3 files changed, 9 insertions(+), 483 deletions(-) delete mode 100644 extlib/megahal.py diff --git a/.gitignore b/.gitignore index 607a91c..560d297 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ dr.botzo.cfg nbproject parser.out parsetab.py +megahal.* diff --git a/extlib/megahal.py b/extlib/megahal.py deleted file mode 100644 index a4a0df6..0000000 --- a/extlib/megahal.py +++ /dev/null @@ -1,472 +0,0 @@ -# Copyright (c) 2010, Chris Jones -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -"""Python implementation of megahal markov bot""" - -from time import time -import shelve -import random -import math -import os - -__version__ = '0.2' -__author__ = 'Chris Jones ' -__license__ = 'BSD' -__all__ = ['MegaHAL', 'Dictionary', 'Tree', '__version__', 'DEFAULT_ORDER', 'DEFAULT_BRAINFILE', 'DEFAULT_TIMEOUT'] - -DEFAULT_ORDER = 5 -DEFAULT_BRAINFILE = os.path.join(os.environ.get('HOME', ''), '.pymegahal-brain') -DEFAULT_TIMEOUT = 1.0 - -API_VERSION = '1.0' -END_WORD = '' -ERROR_WORD = '' - -DEFAULT_BANWORDS = ['A', 'ABILITY', 'ABLE', 'ABOUT', 'ABSOLUTE', 'ABSOLUTELY', 'ACROSS', 'ACTUAL', 'ACTUALLY', 'AFTER', - 'AGAIN', 'AGAINST', 'AGO', 'AGREE', 'ALL', 'ALMOST', 'ALONG', 'ALREADY', 'ALTHOUGH', 'ALWAYS', - 'AN', 'AND', 'ANOTHER', 'ANY', 'ANYHOW', 'ANYTHING', 'ANYWAY', 'ARE', "AREN'T", 'AROUND', 'AS', - 'AWAY', 'BACK', 'BAD', 'BE', 'BEEN', 'BEFORE', 'BEHIND', 'BEING', 'BELIEVE', 'BELONG', 'BEST', - 'BETWEEN', 'BIG', 'BIGGER', 'BIGGEST', 'BIT', 'BOTH', 'BUDDY', 'BUT', 'BY', 'CALL', 'CALLED', - 'CAME', 'CAN', "CAN'T", 'CANNOT', 'CARE', 'CARING', 'CASE', 'CATCH', 'CAUGHT', 'CERTAIN', - 'CHANGE', 'CLOSE', 'CLOSER', 'COME', 'COMING', 'COMMON', 'CONSTANT', 'CONSTANTLY', 'COULD', - 'DAY', 'DAYS', 'DERIVED', 'DESCRIBE', 'DESCRIBES', 'DETERMINE', 'DETERMINES', 'DID', "DIDN'T", - 'DOES', "DOESN'T", 'DOING', "DON'T", 'DONE', 'DOUBT', 'DOWN', 'EACH', 'EARLIER', 'EARLY', 'ELSE', - 'ESPECIALLY', 'EVEN', 'EVER', 'EVERY', 'EVERYBODY', 'EVERYONE', 'EVERYTHING', 'FACT', 'FAIR', - 'FAR', 'FELLOW', 'FEW', 'FIND', 'FINE', 'FOR', 'FORM', 'FOUND', 'FROM', 'FULL', 'FURTHER', 'GAVE', - 'GETTING', 'GIVE', 'GIVEN', 'GIVING', 'GO', 'GOING', 'GONE', 'GOOD', 'GOT', 'GOTTEN', 'GREAT', - 'HAS', "HASN'T", 'HAVE', "HAVEN'T", 'HAVING', 'HELD', 'HERE', 'HIGH', 'HOLD', 'HOLDING', 'HOW', - 'IN', 'INDEED', 'INSIDE', 'INSTEAD', 'INTO', 'IS', "ISN'T", 'IT', "IT'S", 'ITS', 'JUST', 'KEEP', - 'KNEW', 'KNOW', 'KNOWN', 'LARGE', 'LARGER', 'LARGETS', 'LAST', 'LATE', 'LATER', 'LEAST', 'LESS', - "LET'S", 'LEVEL', 'LIKES', 'LITTLE', 'LONG', 'LONGER', 'LOOK', 'LOOKED', 'LOOKING', 'LOOKS', 'LOW', - 'MAKE', 'MAKING', 'MANY', 'MATE', 'MAY', 'MAYBE', 'MEAN', 'MEET', 'MENTION', 'MERE', 'MIGHT', - 'MORE', 'MORNING', 'MOST', 'MOVE', 'MUCH', 'MUST', 'NEAR', 'NEARER', 'NEVER', 'NEXT', 'NICE', - 'NONE', 'NOON', 'NOONE', 'NOT', 'NOTE', 'NOTHING', 'NOW', 'OBVIOUS', 'OF', 'OFF', 'ON', 'ONCE', - 'ONTO', 'OPINION', 'OR', 'OTHER', 'OUR', 'OUT', 'OVER', 'OWN', 'PART', 'PARTICULAR', - 'PERHAPS', 'PERSON', 'PIECE', 'PLACE', 'PLEASANT', 'PLEASE', 'POPULAR', 'PREFER', 'PRETTY', 'PUT', - 'REAL', 'REALLY', 'RECEIVE', 'RECEIVED', 'RECENT', 'RECENTLY', 'RELATED', 'RESULT', 'RESULTING', - 'SAID', 'SAME', 'SAW', 'SAY', 'SAYING', 'SEE', 'SEEM', 'SEEMED', 'SEEMS', 'SEEN', 'SELDOM', - 'SET', 'SEVERAL', 'SHALL', 'SHORT', 'SHORTER', 'SHOULD', 'SHOW', 'SHOWS', 'SIMPLE', 'SIMPLY', - 'SO', 'SOME', 'SOMEONE', 'SOMETHING', 'SOMETIME', 'SOMETIMES', 'SOMEWHERE', 'SORT', 'SORTS', - 'SPENT', 'STILL', 'STUFF', 'SUCH', 'SUGGEST', 'SUGGESTION', 'SUPPOSE', 'SURE', 'SURELY', - 'SURROUNDS', 'TAKE', 'TAKEN', 'TAKING', 'TELL', 'THAN', 'THANK', 'THANKS', 'THAT', "THAT'S", - 'THE', 'THEIR', 'THEM', 'THEN', 'THERE', 'THEREFORE', 'THESE', 'THEY', 'THING', 'THINGS', 'THIS', - 'THOUGH', 'THOUGHTS', 'THOUROUGHLY', 'THROUGH', 'TINY', 'TO', 'TODAY', 'TOGETHER', 'TOLD', - 'TOO', 'TOTAL', 'TOTALLY', 'TOUCH', 'TRY', 'TWICE', 'UNDER', 'UNDERSTAND', 'UNDERSTOOD', 'UNTIL', - 'US', 'USED', 'USING', 'USUALLY', 'VARIOUS', 'VERY', 'WANT', 'WANTED', 'WANTS', 'WAS', 'WATCH', - 'WAYS', 'WE', "WE'RE", 'WELL', 'WENT', 'WERE', 'WHAT', "WHAT'S", 'WHATEVER', 'WHATS', 'WHEN', - "WHERE'S", 'WHICH', 'WHILE', 'WHILST', 'WHO', "WHO'S", 'WHOM', 'WILL', 'WISH', 'WITH', 'WITHIN', - 'WONDERFUL', 'WORSE', 'WORST', 'WOULD', 'WRONG', 'YESTERDAY', 'YET'] - -DEFAULT_AUXWORDS = ['DISLIKE', 'HE', 'HER', 'HERS', 'HIM', 'HIS', 'I', "I'D", "I'LL", "I'M", "I'VE", 'LIKE', 'ME', - 'MY', 'MYSELF', 'ONE', 'SHE', 'THREE', 'TWO', 'YOU', "YOU'D", "YOU'LL", "YOU'RE", "YOU'VE", 'YOUR', - 'YOURSELF'] - -DEFAULT_SWAPWORDS = {"YOU'RE": "I'M", "YOU'D": "I'D", 'HATE': 'LOVE', 'YOUR': 'MY', "I'LL": "YOU'LL", 'NO': 'YES', - 'WHY': 'BECAUSE', 'YOU': 'ME', 'LOVE': 'HATE', 'I': 'YOU', 'MINE': 'YOURS', 'YOURSELF': 'MYSELF', - 'DISLIKE': 'LIKE', "I'M": "YOU'RE", 'ME': 'YOU', 'MYSELF': 'YOURSELF', 'LIKE': 'DISLIKE', - "I'D": "YOU'D", "YOU'VE": "I'VE", 'YES': 'NO', 'MY': 'YOUR'} - -class Tree(object): - - def __init__(self, symbol=0): - self.symbol = symbol - self.usage = 0 - self.count = 0 - self.children = [] - - def add_symbol(self, symbol): - node = self.get_child(symbol) - node.count += 1 - self.usage += 1 - return node - - def get_child(self, symbol, add=True): - for child in self.children: - if child.symbol == symbol: - break - else: - if add: - child = Tree(symbol) - self.children.append(child) - else: - child = None - return child - - -class Dictionary(list): - - def add_word(self, word): - try: - return self.index(word) - except ValueError: - self.append(word) - return len(self) - 1 - - def find_word(self, word): - try: - return self.index(word) - except ValueError: - return 0 - - -class Brain(object): - - def __init__(self, order, file, timeout): - self.timeout = timeout - self.db = shelve.open(file, protocol=2, writeback=True) - if self.db.setdefault('api', API_VERSION) != API_VERSION: - raise ValueError('This brain has an incompatible api version: %d != %d' % (self.db['api'], API_VERSION)) - if self.db.setdefault('order', order) != order: - raise ValueError('This brain already has an order of %d' % self.db['order']) - self.forward = self.db.setdefault('forward', Tree()) - self.backward = self.db.setdefault('backward', Tree()) - self.dictionary = self.db.setdefault('dictionary', Dictionary()) - self.error_symbol = self.dictionary.add_word(ERROR_WORD) - self.end_symbol = self.dictionary.add_word(END_WORD) - self.banwords = self.db.setdefault('banwords', Dictionary(DEFAULT_BANWORDS)) - self.auxwords = self.db.setdefault('auxwords', Dictionary(DEFAULT_AUXWORDS)) - self.swapwords = self.db.setdefault('swapwords', DEFAULT_SWAPWORDS) - self.closed = False - - @property - def order(self): - return self.db['order'] - - @staticmethod - def get_words_from_phrase(phrase): - phrase = phrase.upper() - words = [] - if phrase: - offset = 0 - - def boundary(string, position): - if position == 0: - boundary = False - elif position == len(string): - boundary = True - elif (string[position] == "'" and - string[position - 1].isalpha() and - position + 1 == len(string)): - boundary = True - elif (string[position] == "'" and - string[position - 1].isalpha() and - string[position + 1].isalpha()): - boundary = False - elif (position > 1 and - string[position - 1] == "'" and - string[position - 2].isalpha() and - string[position].isalpha()): - boundary = False - elif (string[position].isalpha() and - not string[position - 1].isalpha()): - boundary = True - elif (not string[position].isalpha() and - string[position - 1].isalpha()): - boundary = True - elif string[position].isdigit() != string[position -1].isdigit(): - boundary = True - else: - boundary = False - return boundary - - while True: - if boundary(phrase, offset): - word, phrase = phrase[:offset], phrase[offset:] - words.append(word) - if not phrase: - break - offset = 0 - else: - offset += 1 - if words[-1][0].isalnum(): - words.append('.') - elif words[-1][-1] not in '!.?': - words[-1] = '.' - return words - - def communicate(self, phrase, learn=True, reply=True): - words = self.get_words_from_phrase(phrase) - if learn: - self.learn(words) - if reply: - return self.get_reply(words) - - def get_context(self, tree): - - class Context(dict): - - def __enter__(context): - context.used_key = False - context[0] = tree - return context - - def __exit__(context, *exc_info): - context.update(self.end_symbol) - - @property - def root(context): - return context[0] - - def update(context, symbol): - for i in xrange(self.order + 1, 0, -1): - node = context.get(i - 1) - if node is not None: - context[i] = node.add_symbol(symbol) - - def seed(context, keys): - if keys: - i = random.randrange(len(keys)) - for key in keys[i:] + keys[:i]: - if key not in self.auxwords: - try: - return self.dictionary.index(key) - except ValueError: - pass - if context.root.children: - return random.choice(context.root.children).symbol - return 0 - - def babble(context, keys, replies): - for i in xrange(self.order + 1): - if context.get(i) is not None: - node = context[i] - if not node.children: - return 0 - i = random.randrange(len(node.children)) - count = random.randrange(node.usage) - symbol = 0 - while count >= 0: - symbol = node.children[i].symbol - word = self.dictionary[symbol] - if word in keys and (context.used_key or word not in self.auxwords): - context.used_key = True - break - count -= node.children[i].count - if i >= len(node.children) - 1: - i = 0 - else: - i = i + 1 - return symbol - - return Context() - - def learn(self, words): - if len(words) > self.order: - with self.get_context(self.forward) as context: - for word in words: - context.update(self.dictionary.add_word(word)) - with self.get_context(self.backward) as context: - for word in reversed(words): - context.update(self.dictionary.index(word)) - - def get_reply(self, words): - keywords = self.make_keywords(words) - dummy_reply = self.generate_replywords() - if not dummy_reply or words == dummy_reply: - output = self.get_words_from_phrase("I don't know enough to answer yet!") - else: - output = dummy_reply - - max_surprise = -1.0 - basetime = time() - while time() - basetime < self.timeout: - reply = self.generate_replywords(keywords) - surprise = self.evaluate_reply(keywords, reply) - if reply and surprise > max_surprise and reply != keywords: - max_surprise = surprise - output = reply - - return ''.join(output).capitalize() - - def evaluate_reply(self, keys, words): - state = {'num': 0, 'entropy': 0.0} - if words: - - def evaluate(node, words): - with self.get_context(node) as context: - for word in words: - symbol = self.dictionary.index(word) - context.update(symbol) - if word in keys: - prob = 0.0 - count = 0 - state['num'] += 1 - for j in xrange(self.order): - node = context.get(j) - if node is not None: - child = node.get_child(symbol, add=False) - if child: - prob += float(child.count) / node.usage - count += 1 - if count: - state['entropy'] -= math.log(prob / count) - - evaluate(self.forward, words) - evaluate(self.backward, reversed(words)) - - if state['num'] >= 8: - state['entropy'] /= math.sqrt(state['num'] - 1) - if state['num'] >= 16: - state['entropy'] /= state['num'] - return state['entropy'] - - def generate_replywords(self, keys=None): - if keys is None: - keys = [] - replies = [] - with self.get_context(self.forward) as context: - start = True - while True: - if start: - symbol = context.seed(keys) - start = False - else: - symbol = context.babble(keys, replies) - if symbol in (self.error_symbol, self.end_symbol): - break - replies.append(self.dictionary[symbol]) - context.update(symbol) - with self.get_context(self.backward) as context: - if replies: - for i in xrange(min([(len(replies) - 1), self.order]), -1, -1): - context.update(self.dictionary.index(replies[i])) - while True: - symbol = context.babble(keys, replies) - if symbol in (self.error_symbol, self.end_symbol): - break - replies.insert(0, self.dictionary[symbol]) - context.update(symbol) - - return replies - - def make_keywords(self, words): - keys = Dictionary() - for word in words: - try: - word = self.swapwords[word] - except KeyError: - pass - if (self.dictionary.find_word(word) != self.error_symbol and word[0].isalnum() and - word not in self.banwords and word not in self.auxwords and word not in keys): - keys.append(word) - - if keys: - for word in words: - try: - word = self.swapwords[word] - except KeyError: - pass - if (self.dictionary.find_word(word) != self.error_symbol and word[0].isalnum() and - word in self.auxwords and word not in keys): - keys.append(word) - - return keys - - def add_key(self, keys, word): - if (self.dictionary.find_word(word) != self.error_symbol and - self.banwords.find_word(word) == self.error_symbol and - self.auxwords.find_word(word) == self.error_symbol): - keys.add_word(word) - - def sync(self): - self.db.sync() - - def close(self): - if not self.closed: - print 'Closing database' - self.db.close() - self.closed = True - - def __del__(self): - try: - self.close() - except: - pass - - -class MegaHAL(object): - - def __init__(self, order=None, brainfile=None, timeout=None): - if order is None: - order = DEFAULT_ORDER - if brainfile is None: - brainfile = DEFAULT_BRAINFILE - if timeout is None: - timeout = DEFAULT_TIMEOUT - self.__brain = Brain(order, brainfile, timeout) - - @property - def banwords(self): - """This is a list of words which cannot be used as keywords""" - return self.__brain.banwords - - @property - def auxwords(self): - """This is a list of words which can be used as keywords only in order to supplement other keywords""" - return self.__brain.auxwords - - @property - def swapwords(self): - """The word on the left is changed to the word on the right when used as a keyword""" - return self.__brain.swapwords - - def train(self, file): - """Train the brain with textfile, each line is a phrase""" - with open(file, 'rb') as fp: - for line in fp: - line = line.strip() - if line and not line.startswith('#'): - self.learn(line) - - def learn(self, phrase): - """Learn from phrase""" - self.__brain.communicate(phrase, reply=False) - - def get_reply(self, phrase): - """Get a reply based on the phrase""" - return self.__brain.communicate(phrase) - - def get_reply_nolearn(self, phrase): - """Get a reply without updating the database""" - return self.__brain.communicate(phrase, learn=False) - - def interact(self): - """Have a friendly chat session.. ^D to exit""" - while True: - try: - phrase = raw_input('>>> ') - except EOFError: - break - if phrase: - print self.get_reply(phrase) - - def sync(self): - """Flush any changes to disk""" - self.__brain.sync() - - def close(self): - """Close database""" - self.__brain.close() diff --git a/modules/MegaHAL.py b/modules/MegaHAL.py index ea056b2..c6e830e 100644 --- a/modules/MegaHAL.py +++ b/modules/MegaHAL.py @@ -17,11 +17,11 @@ along with this program. If not, see . """ from ConfigParser import NoOptionError +import mh_python import os import re from extlib import irclib -from extlib import megahal from Module import Module @@ -35,17 +35,13 @@ class MegaHAL(Module): Module.__init__(self, config, server, modlist) - try: - filename = self.config.get(self.__class__.__name__, 'brainfile') - self.megahal = megahal.MegaHAL(None, filename, 1) - except NoOptionError: - self.megahal = megahal.MegaHAL() + mh_python.initbrain() def shutdown(self): """ Close the megahal brain. """ - self.megahal.close() + mh_python.cleanup() def reload(self): """ @@ -53,6 +49,8 @@ class MegaHAL(Module): TODO: figure out why this is broken. (probably because nothing closes the old brain file or deletes the old object, and then all hell breaks loose.) + + Is this still the case with the new MegaHAL library? """ def do(self, connection, event, nick, userhost, replypath, what, admin_unlocked): @@ -72,14 +70,13 @@ class MegaHAL(Module): line = addressed_re.sub('', line) append_nick = True - reply = self.megahal.get_reply(line) + reply = mh_python.doreply(line) elif re.search('^megahal$', line, re.IGNORECASE) is not None: - reply = self.megahal.get_reply_nolearn('') + reply = mh_python.doreply('') elif replypath is not None: # only learn if the command is not an internal one - self.megahal.learn(line) + mh_python.learn(line) - self.megahal.sync() if reply is not "": if append_nick: reply = nick + ": " + reply