markov: tweaks to the sentence generator

This tunes the sentence generator a bit, trying to favor complete sentences over the min/max word counts, which are still heeded but not as strictly. This *should* create more interesting chains, especially with topics, without really breaking things, but it certainly needs some testing before we can tell whether it's actually right.
2016-06-30 22:21:02 -05:00 · 2016-06-30 22:21:02 -05:00 · 897f29c8d4
commit 897f29c8d4
parent 70032dc42b
1 changed file with 45 additions and 12 deletions
--- a/dr_botzo/markov/lib.py
+++ b/dr_botzo/markov/lib.py
@@ -9,14 +9,16 @@ from markov.models import MarkovContext, MarkovState, MarkovTarget
 log = logging.getLogger('markov.lib')


-def generate_line(context, topics=None, min_words=15, max_words=30, max_sentences=3):
+def generate_line(context, topics=None, min_words=15, max_words=30, max_sentences=3, max_tries=5):
    """String multiple sentences together into a coherent sentence."""

    tries = 0
    sentences = 0
    line = []
-    while tries < 5:
-        line += generate_longish_sentence(context, topics=topics, max_words=max_words)
+    min_words_per_sentence = min_words / max_sentences
+    while tries < max_tries:
+        line += generate_longish_sentence(context, topics=topics, min_words=min_words_per_sentence,
+                                          max_words=max_words, max_tries=max_tries)
        sentences += 1
        if sentences >= max_sentences:
            return line
@@ -32,22 +34,23 @@ def generate_line(context, topics=None, min_words=15, max_words=30, max_sentence
    return line


-def generate_longish_sentence(context, topics=None, min_words=4, max_words=30):
+def generate_longish_sentence(context, topics=None, min_words=15, max_words=30, max_tries=5):
    """Generate a Markov chain, but throw away the short ones unless we get desperate."""

+    sent = ""
    tries = 0
-    while tries < 5:
-        sent = generate_sentence(context, topics=topics, max_words=max_words)
+    while tries < max_tries:
+        sent = generate_sentence(context, topics=topics, min_words=min_words, max_words=max_words)
        if len(sent) >= min_words:
            return sent

        tries += 1

    # if we got here, we need to just give up
-    return generate_sentence(context)
+    return sent


-def generate_sentence(context, topics=None, max_words=30):
+def generate_sentence(context, topics=None, min_words=15, max_words=30):
    """Generate a Markov chain."""

    words = []
@@ -64,18 +67,48 @@ def generate_sentence(context, topics=None, max_words=30):
            while len(words) <= max_words and words[0] != MarkovState._start2:
                log.debug("looking backwards for '{0:s}'".format(words[0]))
                new_states = MarkovState.objects.filter(context=context, v=words[0])
-                words.insert(0, get_word_out_of_states(new_states, backwards=True))
+                # if we find a start, use it
+                if MarkovState._start2 in new_states:
+                    log.debug("found a start2 in the results, intentionally picking it")
+                    words.insert(0, MarkovState._start2)
+                else:
+                    words.insert(0, get_word_out_of_states(new_states, backwards=True))
+                    log.debug("picked %s", words[0])

-    # if we didn't get topic stuff, we need to start (forwards) here
+    # if what we found is too long, abandon it, sadly
+    if len(words) > max_words:
+        log.debug("%s is too long, i'm going to give up on it", words)
+        words.clear()
+
+    # if we didn't get topic stuff, we need to start (forwards) here, otherwise we use
+    # what we already put together (obviously)
    if len(words) == 0:
        words = [MarkovState._start1, MarkovState._start2]

    i = len(words)
-    while len(words) <= max_words and words[-1] != MarkovState._stop:
+    while words[-1] != MarkovState._stop:
        log.debug("looking for '{0:s}','{1:s}'".format(words[i-2], words[i-1]))
        new_states = MarkovState.objects.filter(context=context, k1=words[i-2], k2=words[i-1])
        log.debug("states retrieved")
-        words.append(get_word_out_of_states(new_states))
+
+        # try to find states that are in our targets
+        if topics and len(topics):
+            target_hits = list(set(words).intersection(set(topics)))
+        else:
+            target_hits = []
+
+        # if we're over min_words, and got a stop naturally, use it
+        if len(words) > min_words and MarkovState._stop in new_states:
+            log.debug("found a stop in the results, intentionally picking it")
+            words.append(MarkovState._stop)
+        elif len(target_hits) > 0:
+            target_hit = random.choice(target_hits)
+            log.debug("found a topic hit %s, using it", target_hit)
+            topics.remove(target_hit)
+            words.append(target_hit)
+        else:
+            words.append(get_word_out_of_states(new_states))
+            log.debug("picked %s", words[-1])
        i += 1

    words = [word for word in words if word not in