dr.botzo/markov/lib.py

"""Provide methods for manipulating markov chain processing."""
import logging
import random
from django.db.models import Sum
from markov.models import MarkovContext, MarkovState, MarkovTarget
log = logging.getLogger(__name__)


def generate_line(context, topics=None, min_words=15, max_words=30, sentence_bias=2, max_tries=5):
    """Combine multiple sentences into a coherent line."""
    tries = 0
    line = []
    min_words_per_sentence = min_words / sentence_bias

    while tries < max_tries:
        line += generate_longish_sentence(context, topics=topics, min_words=min_words_per_sentence,
                                          max_words=max_words, max_tries=max_tries)

        if len(line) >= min_words:
            return line
        else:
            if len(line) > 0:
                if line[-1][-1] not in [',', '.', '!', '?', ':']:
                    line[-1] += random.SystemRandom().choice(['?', '.', '!'])

        tries += 1

    # if we got here, we need to give up
    return line
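
# A worked pass with the defaults: min_words / sentence_bias == 15 / 2 == 7.5, so each call to
# generate_longish_sentence() only has to come back with roughly half a line (8+ words) before
# the fragments are joined and, when needed, capped with a random '?', '.' or '!'.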


def generate_longish_sentence(context, topics=None, min_words=15, max_words=30, max_tries=100):
    """Generate a Markov chain, but throw away the short ones unless we get desperate."""
    sent = []
    tries = 0

    while tries < max_tries:
        sent = generate_sentence(context, topics=topics, min_words=min_words, max_words=max_words)

        if len(sent) >= min_words:
            log.debug("found a longish sentence, %s", sent)
            return sent
        else:
            log.debug("%s isn't long enough, going to try again", sent)

        tries += 1

    # if we got here, we need to just give up
    return sent


def generate_sentence(context, topics=None, min_words=15, max_words=30):
    """Generate a Markov chain."""
    words = []

    # if we have topics, try to work from one and walk backwards
    if topics:
        topic_word = random.SystemRandom().choice(topics)
        topics.remove(topic_word)
        log.debug("looking for topic '%s'", topic_word)
        new_states = MarkovState.objects.filter(context=context, v=topic_word)

        if len(new_states) > 0:
            log.debug("found '%s', starting backwards", topic_word)
            words.insert(0, topic_word)

            while len(words) <= max_words and words[0] != MarkovState._start2:
                log.debug("looking backwards for '%s'", words[0])
                new_states = MarkovState.objects.filter(context=context, v=words[0])

                # if we find a start, use it
                if MarkovState._start2 in new_states:
                    log.debug("found a start2 in the results, intentionally picking it")
                    words.insert(0, MarkovState._start2)
                else:
                    words.insert(0, get_word_out_of_states(new_states, backwards=True))
                    log.debug("picked %s", words[0])

            # if what we found is too long, abandon it, sadly
            if len(words) > max_words:
                log.debug("%s is too long, i'm going to give up on it", words)
                words.clear()

    # if we didn't get topic stuff, we need to start (forwards) here, otherwise we use
    # what we already put together (obviously)
    if len(words) == 0:
        words = [MarkovState._start1, MarkovState._start2]

    i = len(words)
    while words[-1] != MarkovState._stop:
        log.debug("looking for '%s','%s'", words[i-2], words[i-1])
        new_states = MarkovState.objects.filter(context=context, k1=words[i-2], k2=words[i-1])
log.debug("states retrieved")
# try to find states that are in our targets
if topics and len(topics):
target_hits = list(set(words).intersection(set(topics)))
else:
target_hits = []
if len(words) > min_words and MarkovState._stop in new_states:
# if we're over min_words, and got a stop naturally, use it
log.debug("found a stop in the results, intentionally picking it")
words.append(MarkovState._stop)
elif len(target_hits) > 0:
# if there's a target word in the states, pick it
target_hit = random.SystemRandom().choice(target_hits)
log.debug("found a topic hit %s, using it", target_hit)
topics.remove(target_hit)
words.append(target_hit)
elif len(words) <= min_words:
# if we still need more words, intentionally avoid stop
words.append(get_word_out_of_states(new_states.exclude(v=MarkovState._stop)))
log.debug("picked (stop avoidance) %s", words[-1])
else:
words.append(get_word_out_of_states(new_states))
log.debug("picked %s", words[-1])
i += 1
words = [word for word in words if word not in
(MarkovState._start1, MarkovState._start2, MarkovState._stop)]
# if what we found is too long, abandon it, sadly
if len(words) > max_words:
log.debug("%s is too long, i'm going to give up on it", words)
words.clear()
return words
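
# For reference, the walk above consumes states laid down by learn_line() below: each state maps a
# (k1, k2) word pair to a following word v, with the sentinels MarkovState._start1/_start2 padding
# the front of a learned line and MarkovState._stop terminating it. Learning "the quick brown fox",
# for example, yields transitions like (_start1, _start2) -> "the", (_start2, "the") -> "quick",
# ..., ("brown", "fox") -> _stop.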


def get_or_create_target_context(target_name):
    """Return the context for a provided nick/channel, creating missing ones."""
    target_name = target_name.lower()

    # find the stuff, or create it
    try:
        target = MarkovTarget.objects.get(name=target_name)
    except MarkovTarget.DoesNotExist:
        # we need to create a context and a target, and we have to make the context first
        # make a context --- lacking a good idea, just create one with this target name until configured otherwise
        context, c = MarkovContext.objects.get_or_create(name=target_name)
        target, c = MarkovTarget.objects.get_or_create(name=target_name, context=context)
        return target.context

    try:
        return target.context
    except MarkovContext.DoesNotExist:
        # make a context --- lacking a good idea, just create one with this target name until configured otherwise
        context, c = MarkovContext.objects.get_or_create(name=target_name)
        target.context = context
        target.save()
        return target.context
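
# A minimal usage sketch (the channel name is only an illustration, not anything shipped with the
# module): get_or_create_target_context('#SomeChannel') looks up the MarkovTarget named
# '#somechannel' and hands back its MarkovContext, creating both records if they do not exist yet.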


def get_word_out_of_states(states, backwards=False):
    """Pick one random word out of the given states."""
    # work around possible broken data, where a k1,k2 should have a value but doesn't
    if len(states) == 0:
        states = MarkovState.objects.filter(v=MarkovState._stop)

    new_word = ''
    running = 0
    count_sum = states.aggregate(Sum('count'))['count__sum']
    if not count_sum:
        # this being None probably means there's no data for this context
        raise ValueError("no markov states to generate from")

    hit = random.SystemRandom().randint(0, count_sum)
    log.debug("sum: %s hit: %s", count_sum, hit)

    states_itr = states.iterator()
    for state in states_itr:
        running += state.count
        if running >= hit:
            if backwards:
                new_word = state.k2
            else:
                new_word = state.v
            break

    log.debug("found '%s'", new_word)
    return new_word
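
# The loop above is a count-weighted random pick: draw a number in [0, sum(count)] and walk the
# states until the running total reaches it. A minimal sketch of the same idea over plain
# (word, count) pairs, outside the ORM:
#
#     pairs = [('fox', 3), ('dog', 1)]  # 'fox' should win roughly 3 times as often
#     hit = random.SystemRandom().randint(0, sum(count for _, count in pairs))
#     running = 0
#     for word, count in pairs:
#         running += count
#         if running >= hit:
#             break  # 'word' is the weighted choice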


def learn_line(line, context):
    """Create a bunch of MarkovStates for a given line of text."""
log.debug("learning %s...", line[:40])
words = line.split()
words = [MarkovState._start1, MarkovState._start2] + words + [MarkovState._stop]
for word in words:
if len(word) > MarkovState._meta.get_field('k1').max_length:
return
for i, word in enumerate(words):
log.debug("'%s','%s' -> '%s'", words[i], words[i+1], words[i+2])
state, created = MarkovState.objects.get_or_create(context=context,
k1=words[i],
k2=words[i+1],
v=words[i+2])
state.count += 1
state.save()
if i > len(words) - 4:
break
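

# Example usage, as a hedged sketch (the context name and the learned text are placeholders, not
# anything shipped with the module):
#
#     from markov.lib import generate_line, learn_line
#     from markov.models import MarkovContext
#
#     context, _ = MarkovContext.objects.get_or_create(name='example')
#     learn_line("the quick brown fox jumps over the lazy dog", context)
#     words = generate_line(context, topics=['fox'], min_words=5)
#     print(' '.join(words))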