From 26ec854c67b0e94696235847d6875acfbf9abfb7 Mon Sep 17 00:00:00 2001
From: "Brian S. Stephan" <bss@incorporeal.org>
Date: Sun, 29 Jul 2012 15:43:15 -0500
Subject: [PATCH] Markov: try to avoid "nick:" starts when chaining extra sentences

When starting another sentence because the main one is too short,
do a bit of extra work to avoid sentences that start with an address
such as "nick: blah", since those are fairly common. Instead, drop
the leading "nick:" word and start with the words that follow it
("blah blah").
---
 modules/Markov.py | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/modules/Markov.py b/modules/Markov.py
index c0349ea..b35e8f7 100644
--- a/modules/Markov.py
+++ b/modules/Markov.py
@@ -414,17 +414,38 @@ class Markov(Module):
                         gen_words[-1] = sentence_end + eos_punctuation.pop()
                         self.log.debug("monkeyed with end of sentence, it's now: {0:s}".format(gen_words[:-1]))
 
+                    new_chain_words = []
                     # new word 1
                     key_hits = self._retrieve_chains_for_key(self.start1, self.start2, context_id)
-                    gen_words.append(self._get_suitable_word_from_choices(key_hits, gen_words, min_size))
-
+                    new_chain_words.append(self._get_suitable_word_from_choices(key_hits, gen_words, min_size))
                     # the database is probably empty if we got a stop from this
-                    if gen_words[-1] == self.stop:
+                    if gen_words[0] == self.stop:
                         break
+                    # new word 2
+                    key_hits = self._retrieve_chains_for_key(self.start2, new_chain_words[0], context_id)
+                    new_chain_words.append(self._get_suitable_word_from_choices(key_hits, gen_words, min_size))
+                    if gen_words[1] != self.stop:
+                        # two valid words, try for a third and check for "foo:"
+
+                        # new word 3 (which we may need below)
+                        key_hits = self._retrieve_chains_for_key(new_chain_words[0], new_chain_words[1], context_id)
+                        new_chain_words.append(self._get_suitable_word_from_choices(key_hits, gen_words, min_size))
+
+                        # if the first word is "foo:", start with the second
+                        addressing_suffixes = [':', ',']
+                        if gen_words[0][-1] in addressing_suffixes:
+                            gen_words += new_chain_words[1:]
+                            self.log.debug("appending following anti-address " \
+                                           "new_chain_words: {0:s}".format(new_chain_words[1:]))
+                        else:
+                            gen_words += new_chain_words[0:]
+                            self.log.debug("appending following extended " \
+                                           "new_chain_words: {0:s}".format(new_chain_words[0:]))
                     else:
-                        # new word 2
-                        key_hits = self._retrieve_chains_for_key(self.start2, gen_words[-1], context_id)
-                        gen_words.append(self._get_suitable_word_from_choices(key_hits, gen_words, min_size))
+                        # well, we got one word out of this... let's go with it
+                        # and let the loop check if we need more
+                        self.log.debug("appending following short new_chain_words: {0:s}".format(new_chain_words))
+                        gen_words += new_chain_words
 
         # chop off the seed data at the start
         gen_words = gen_words[2:]