dr.botzo/markov/lib.py

"""Provide methods for manipulating markov chain processing."""
import logging
import random

from django.db.models import Sum

from markov.models import MarkovState

log = logging.getLogger(__name__)


def generate_line(context, topics=None, min_words=15, max_words=30, sentence_bias=2, max_tries=5):
    """Chain generated sentences together until the line has enough words.

    Sentences from generate_longish_sentence() are appended one after another
    until the line holds at least ``min_words`` words or ``max_tries`` sentences
    have been requested. Each individual sentence is asked for with a minimum
    of ``min_words / sentence_bias`` words.

    Args:
        context: passed through to generate_longish_sentence()
        topics: optional list of topic words, passed through unchanged
        min_words: number of words the finished line should reach
        max_words: upper bound handed to each sentence generation
        sentence_bias: divisor applied to ``min_words`` for each sentence
        max_tries: number of sentences to request before giving up

    Returns:
        the list of words collected, which may be shorter than ``min_words``
        if every try was used up.
    """
    sentence_floor = min_words / sentence_bias
    already_punctuated = (',', '.', '!', '?', ':')
    sentence_enders = ['?', '.', '!']

    assembled = []
    for _ in range(max_tries):
        assembled.extend(generate_longish_sentence(context, topics=topics,
                                                   min_words=sentence_floor,
                                                   max_words=max_words,
                                                   max_tries=max_tries))
        if len(assembled) >= min_words:
            return assembled

        # another sentence will follow, so make sure this one is closed off
        if assembled and assembled[-1][-1] not in already_punctuated:
            assembled[-1] += random.SystemRandom().choice(sentence_enders)

    # out of tries; hand back whatever was collected
    return assembled


def generate_longish_sentence(context, topics=None, min_words=15, max_words=30, max_tries=100):
    """Generate a Markov chain, but throw away the short ones unless we get desperate.

    generate_sentence() is called up to ``max_tries`` times. The first result
    with at least ``min_words`` words is returned immediately; if none
    qualifies, the last attempt is returned as it is.

    Args:
        context: passed through to generate_sentence()
        topics: optional list of topic words, passed through unchanged
        min_words: number of words a sentence needs to be accepted
        max_words: upper bound passed through to generate_sentence()
        max_tries: number of sentences to generate before giving up

    Returns:
        a list of words; empty when no attempt was made or the last attempt
        produced nothing.
    """
    # start from an empty list (not a string) so the return type is always a list,
    # even when no attempt gets made
    sent = []
    for _ in range(max_tries):
        sent = generate_sentence(context, topics=topics, min_words=min_words, max_words=max_words)
        if len(sent) >= min_words:
            log.debug("found a longish sentence, %s", sent)
            return sent

        log.debug("%s isn't long enough, going to try again", sent)

    # if we got here, we need to just give up
    return sent


def generate_sentence(context, topics=None, min_words=15, max_words=30):
    """Generate a Markov chain as a list of words.

    When ``topics`` is given, one topic is chosen at random and removed from
    the list. If the chain has states leading to that word, the sentence is
    seeded by walking backwards from it towards a sentence start. The chain is
    then walked forwards, two words of state at a time, until a stop marker is
    reached.

    Args:
        context: value matched against ``MarkovState.context`` in every lookup
        topics: optional list of words to try to work into the sentence; it is
            mutated in place, topics being removed as they are tried or used
        min_words: while the working chain (markers included) is no longer than
            this, stop states are avoided; once it is longer, an available stop
            is taken deliberately
        max_words: maximum number of words allowed in the result

    Returns:
        the words of the sentence with the start/stop markers stripped, or an
        empty list if the sentence grew past ``max_words``.

    Raises:
        ValueError: propagated from get_word_out_of_states() when there are no
            states to pick from.
    """
    words = []
    # if we have topics, try to work from it and work backwards
    if topics:
        topic_word = random.SystemRandom().choice(topics)
        topics.remove(topic_word)
        log.debug("looking for topic '%s'", topic_word)
        new_states = MarkovState.objects.filter(context=context, v=topic_word)

        if new_states.exists():
            log.debug("found '%s', starting backwards", topic_word)
            words.insert(0, topic_word)
            while len(words) <= max_words and words[0] != MarkovState._start2:
                log.debug("looking backwards for '%s'", words[0])
                new_states = MarkovState.objects.filter(context=context, v=words[0])
                # if we find a start, use it; the states are rows, not words, so the
                # marker has to be looked for in the k2 column rather than with `in`
                if new_states.filter(k2=MarkovState._start2).exists():
                    log.debug("found a start2 in the results, intentionally picking it")
                    words.insert(0, MarkovState._start2)
                else:
                    words.insert(0, get_word_out_of_states(new_states, backwards=True))
                    log.debug("picked %s", words[0])

    # if what we found is too long, abandon it, sadly
    if len(words) > max_words:
        log.debug("%s is too long, i'm going to give up on it", words)
        words.clear()

    # if we didn't get topic stuff, we need to start (forwards) here, otherwise we use
    # what we already put together (obviously)
    if len(words) == 0:
        words = [MarkovState._start1, MarkovState._start2]

    while words[-1] != MarkovState._stop:
        # at most two start markers lead the chain, so past this length the result is
        # certain to be discarded below; stop querying instead of walking on
        if len(words) > max_words + 2:
            log.debug("chain is already past max_words, not extending it further")
            break

        log.debug("looking for '%s','%s'", words[-2], words[-1])
        new_states = MarkovState.objects.filter(context=context, k1=words[-2], k2=words[-1])
        log.debug("states retrieved")

        # try to find states that are in our targets: the candidate next words
        # (the v of each state) that are also remaining topics
        if topics:
            candidate_words = set(new_states.filter(v__in=topics).values_list('v', flat=True))
            target_hits = list(candidate_words.intersection(topics))
        else:
            target_hits = []

        if len(words) > min_words and new_states.filter(v=MarkovState._stop).exists():
            # if we're over min_words, and got a stop naturally, use it
            log.debug("found a stop in the results, intentionally picking it")
            words.append(MarkovState._stop)
        elif len(target_hits) > 0:
            # if there's a target word in the states, pick it
            target_hit = random.SystemRandom().choice(target_hits)
            log.debug("found a topic hit %s, using it", target_hit)
            topics.remove(target_hit)
            words.append(target_hit)
        elif len(words) <= min_words:
            # if we still need more words, intentionally avoid stop
            words.append(get_word_out_of_states(new_states.exclude(v=MarkovState._stop)))
            log.debug("picked (stop avoidance) %s", words[-1])
        else:
            words.append(get_word_out_of_states(new_states))
            log.debug("picked %s", words[-1])

    words = [word for word in words if word not in
             (MarkovState._start1, MarkovState._start2, MarkovState._stop)]

    # if what we found is too long, abandon it, sadly
    if len(words) > max_words:
        log.debug("%s is too long, i'm going to give up on it", words)
        words.clear()

    return words


def get_word_out_of_states(states, backwards=False):
    """Pick one random word out of the given states, weighted by each state's count.

    Args:
        states: queryset of MarkovState rows to choose from
        backwards: when true, return the chosen state's ``k2`` (the word before)
            instead of its ``v`` (the word after)

    Returns:
        the chosen word, or an empty string if iteration never reached the
        drawn position.

    Raises:
        ValueError: if the counts of the states sum to nothing.
    """
    # work around possible broken data, where a k1,k2 should have a value but doesn't
    if not states.exists():
        # NOTE(review): this fallback is not limited to any context, and in backwards
        # mode it yields the k2 of a stop state -- confirm that is acceptable
        states = MarkovState.objects.filter(v=MarkovState._stop)

    count_sum = states.aggregate(Sum('count'))['count__sum']
    if not count_sum:
        # this being None probably means there's no data for this context
        raise ValueError("no markov states to generate from")

    # randint() includes both endpoints, so draw from 1..count_sum; drawing from 0
    # would give whichever state comes first one extra chance of being picked
    hit = random.SystemRandom().randint(1, count_sum)

    log.debug("sum: %s hit: %s", count_sum, hit)

    new_word = ''
    running = 0
    for state in states.iterator():
        running += state.count
        if running >= hit:
            new_word = state.k2 if backwards else state.v
            break

    log.debug("found '%s'", new_word)
    return new_word


def learn_line(line, context):
    """Record the word transitions of one line of text as MarkovStates.

    The line is split on whitespace and wrapped in the start and stop markers.
    For every run of three consecutive items, the MarkovState for
    (context, k1, k2, v) is fetched or created and its count is incremented.
    If any item is longer than the ``k1`` field's max_length, nothing is
    learned from the line at all.

    Args:
        line: the text to learn from
        context: the context value to store the states under
    """
    log.debug("learning %s...", line[:40])

    chain = [MarkovState._start1, MarkovState._start2] + line.split() + [MarkovState._stop]

    # one oversized word disqualifies the whole line
    longest_allowed = MarkovState._meta.get_field('k1').max_length
    if any(len(token) > longest_allowed for token in chain):
        return

    # slide a three-wide window over the chain: two key words and the word that follows
    for first, second, following in zip(chain, chain[1:], chain[2:]):
        log.debug("'%s','%s' -> '%s'", first, second, following)
        state, _ = MarkovState.objects.get_or_create(context=context,
                                                     k1=first,
                                                     k2=second,
                                                     v=following)
        state.count += 1
        state.save()