Merge branch 'markov-tweaks' into 'master'

Markov tweaks: new sentence generator, new import

This changes the way that sentences are generated, to ideally be a bit faster than before, and also adds another import method that accepts plain text, rather than assuming IRC logs.

See merge request !9
This commit is contained in:
Brian S. Stephan 2016-12-04 10:54:06 -06:00
commit 1b8faaca9e
6 changed files with 144 additions and 53 deletions

View File

@ -19,7 +19,7 @@ admin.site.register(MarkovTarget)
admin.site.register(MarkovState)
@permission_required('import_log_file', raise_exception=True)
@permission_required('import_text_file', raise_exception=True)
def import_file(request):
"""Accept a file upload and turn it into markov stuff.
@ -30,31 +30,58 @@ def import_file(request):
if request.method == 'POST':
form = LogUploadForm(request.POST, request.FILES)
if form.is_valid():
log_file = request.FILES['log_file']
context = form.cleaned_data['context']
ignores = form.cleaned_data['ignore_nicks'].split(',')
strips = form.cleaned_data['strip_prefixes'].split(' ')
if form.cleaned_data['text_file_format'] == LogUploadForm.FILE_FORMAT_WEECHAT:
text_file = request.FILES['text_file']
context = form.cleaned_data['context']
ignores = form.cleaned_data['ignore_nicks'].split(',')
strips = form.cleaned_data['strip_prefixes'].split(' ')
whos = []
for line in log_file:
log.debug(line)
(timestamp, who, what) = line.decode('utf-8').split('\t', 2)
whos = []
for line in text_file:
log.debug(line)
(timestamp, who, what) = line.decode('utf-8').split('\t', 2)
if who in ('-->', '<--', '--', ' *'):
continue
if who in ('-->', '<--', '--', ' *'):
continue
if who in ignores:
continue
if who in ignores:
continue
whos.append(who)
whos.append(who)
# this is a line we probably care about now
what = [x for x in what.rstrip().split(' ') if x not in strips]
markovlib.learn_line(' '.join(what), context)
log.debug("learned")
# this is a line we probably care about now
what = [x for x in what.rstrip().split(' ') if x not in strips]
markovlib.learn_line(' '.join(what), context)
log.debug("learned")
log.debug(set(whos))
form = LogUploadForm()
log.debug(set(whos))
form = LogUploadForm()
elif form.cleaned_data['text_file_format'] == LogUploadForm.FILE_FORMAT_RAW_TEXT:
text_file = request.FILES['text_file']
context = form.cleaned_data['context']
k1 = MarkovState._start1
k2 = MarkovState._start2
for line in text_file:
for word in [x for x in line.decode('utf-8') .rstrip().split(' ')]:
log.info(word)
if word:
state, created = MarkovState.objects.get_or_create(context=context, k1=k1,
k2=k2, v=word)
state.count += 1
state.save()
if word[-1] in ['.', '?', '!']:
state, created = MarkovState.objects.get_or_create(context=context, k1=k2,
k2=word, v=MarkovState._stop)
state.count += 1
state.save()
k1 = MarkovState._start1
k2 = MarkovState._start2
else:
k1 = k2
k2 = word
else:
form = LogUploadForm()

View File

@ -2,7 +2,7 @@
import logging
from django.forms import Form, CharField, FileField, ModelChoiceField
from django.forms import Form, CharField, ChoiceField, FileField, ModelChoiceField
from markov.models import MarkovContext
@ -13,11 +13,20 @@ class LogUploadForm(Form):
"""Accept a file upload that will be imported into Markov stuff."""
log_file = FileField(help_text="Weechat log format.")
FILE_FORMAT_WEECHAT = 'WEECHAT'
FILE_FORMAT_RAW_TEXT = 'RAW'
FILE_FORMAT_CHOICES = (
(FILE_FORMAT_WEECHAT, "Weechat"),
(FILE_FORMAT_RAW_TEXT, "Raw text file"),
)
text_file = FileField()
text_file_format = ChoiceField(choices=FILE_FORMAT_CHOICES)
context = ModelChoiceField(queryset=MarkovContext.objects.all())
ignore_nicks = CharField(help_text="Comma-separated list of nicks to ignore.",
ignore_nicks = CharField(help_text="Comma-separated list of nicks to ignore. For Weechat logs.",
required=False)
strip_prefixes = CharField(help_text="Space-separated list of line prefixes to strip.",
strip_prefixes = CharField(help_text="Space-separated list of line prefixes to strip. For Weechat logs.",
required=False)

View File

@ -55,12 +55,10 @@ class Markov(Plugin):
topics = [x for x in line.split(' ') if len(x) >= 3]
return self.bot.reply(event, " ".join(markovlib.generate_line(context, topics=topics,
min_words=min_size, max_words=max_size,
max_sentences=1)))
min_words=min_size, max_words=max_size)))
else:
return self.bot.reply(event, " ".join(markovlib.generate_line(context, min_words=min_size,
max_words=max_size,
max_sentences=1)))
max_words=max_size)))
def handle_chatter(self, connection, event):
"""Learn from IRC chatter."""
@ -98,17 +96,13 @@ class Markov(Plugin):
topics = [x for x in addressed_re.match(what).group(1).split(' ') if len(x) >= 3]
return self.bot.reply(event, "{0:s}: {1:s}"
"".format(nick, " ".join(markovlib.generate_line(context,
topics=topics,
max_sentences=1))))
"".format(nick, " ".join(markovlib.generate_line(context, topics=topics))))
else:
# i wasn't addressed directly, so just respond
topics = [x for x in what.split(' ') if len(x) >= 3]
return self.bot.reply(event, "{0:s}"
"".format(" ".join(markovlib.generate_line(context,
topics=topics,
max_sentences=1))))
"".format(" ".join(markovlib.generate_line(context, topics=topics))))
plugin = Markov

View File

@ -9,22 +9,21 @@ from markov.models import MarkovContext, MarkovState, MarkovTarget
log = logging.getLogger('markov.lib')
def generate_line(context, topics=None, min_words=15, max_words=30, max_sentences=3):
def generate_line(context, topics=None, min_words=15, max_words=30, sentence_bias=2, max_tries=5):
"""String multiple sentences together into a coherent sentence."""
tries = 0
sentences = 0
line = []
while tries < 5:
line += generate_longish_sentence(context, topics=topics, max_words=max_words)
sentences += 1
if sentences >= max_sentences:
return line
min_words_per_sentence = min_words / sentence_bias
while tries < max_tries:
line += generate_longish_sentence(context, topics=topics, min_words=min_words_per_sentence,
max_words=max_words, max_tries=max_tries)
if len(line) >= min_words:
return line
else:
if line[-1][-1] not in [',', '.', '!']:
line[-1] += random.choice([',', '.', '!'])
if len(line) > 0:
if line[-1][-1] not in [',', '.', '!', '?', ':']:
line[-1] += random.choice(['?', '.', '!'])
tries += 1
@ -32,22 +31,26 @@ def generate_line(context, topics=None, min_words=15, max_words=30, max_sentence
return line
def generate_longish_sentence(context, topics=None, min_words=4, max_words=30):
def generate_longish_sentence(context, topics=None, min_words=15, max_words=30, max_tries=100):
"""Generate a Markov chain, but throw away the short ones unless we get desperate."""
sent = ""
tries = 0
while tries < 5:
sent = generate_sentence(context, topics=topics, max_words=max_words)
while tries < max_tries:
sent = generate_sentence(context, topics=topics, min_words=min_words, max_words=max_words)
if len(sent) >= min_words:
log.debug("found a longish sentence, %s", sent)
return sent
else:
log.debug("%s isn't long enough, going to try again", sent)
tries += 1
# if we got here, we need to just give up
return generate_sentence(context)
return sent
def generate_sentence(context, topics=None, max_words=30):
def generate_sentence(context, topics=None, min_words=15, max_words=30):
"""Generate a Markov chain."""
words = []
@ -64,23 +67,63 @@ def generate_sentence(context, topics=None, max_words=30):
while len(words) <= max_words and words[0] != MarkovState._start2:
log.debug("looking backwards for '{0:s}'".format(words[0]))
new_states = MarkovState.objects.filter(context=context, v=words[0])
words.insert(0, get_word_out_of_states(new_states, backwards=True))
# if we find a start, use it
if MarkovState._start2 in new_states:
log.debug("found a start2 in the results, intentionally picking it")
words.insert(0, MarkovState._start2)
else:
words.insert(0, get_word_out_of_states(new_states, backwards=True))
log.debug("picked %s", words[0])
# if we didn't get topic stuff, we need to start (forwards) here
# if what we found is too long, abandon it, sadly
if len(words) > max_words:
log.debug("%s is too long, i'm going to give up on it", words)
words.clear()
# if we didn't get topic stuff, we need to start (forwards) here, otherwise we use
# what we already put together (obviously)
if len(words) == 0:
words = [MarkovState._start1, MarkovState._start2]
i = len(words)
while len(words) <= max_words and words[-1] != MarkovState._stop:
while words[-1] != MarkovState._stop:
log.debug("looking for '{0:s}','{1:s}'".format(words[i-2], words[i-1]))
new_states = MarkovState.objects.filter(context=context, k1=words[i-2], k2=words[i-1])
log.debug("states retrieved")
words.append(get_word_out_of_states(new_states))
# try to find states that are in our targets
if topics and len(topics):
target_hits = list(set(words).intersection(set(topics)))
else:
target_hits = []
if len(words) > min_words and MarkovState._stop in new_states:
# if we're over min_words, and got a stop naturally, use it
log.debug("found a stop in the results, intentionally picking it")
words.append(MarkovState._stop)
elif len(target_hits) > 0:
# if there's a target word in the states, pick it
target_hit = random.choice(target_hits)
log.debug("found a topic hit %s, using it", target_hit)
topics.remove(target_hit)
words.append(target_hit)
elif len(words) <= min_words:
# if we still need more words, intentionally avoid stop
words.append(get_word_out_of_states(new_states.exclude(v=MarkovState._stop)))
log.debug("picked (stop avoidance) %s", words[-1])
else:
words.append(get_word_out_of_states(new_states))
log.debug("picked %s", words[-1])
i += 1
words = [word for word in words if word not in
(MarkovState._start1, MarkovState._start2, MarkovState._stop)]
# if what we found is too long, abandon it, sadly
if len(words) > max_words:
log.debug("%s is too long, i'm going to give up on it", words)
words.clear()
return words

View File

@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('markov', '0002_auto_20150514_2317'),
]
operations = [
migrations.AlterModelOptions(
name='markovstate',
options={'permissions': set([('import_text_file', 'Can import states from a text file'), ('teach_line', 'Can teach lines')])},
),
]

View File

@ -59,7 +59,7 @@ class MarkovState(models.Model):
['context', 'v'],
]
permissions = {
('import_log_file', "Can import states from a log file"),
('import_text_file', "Can import states from a text file"),
('teach_line', "Can teach lines"),
}
unique_together = ('context', 'k1', 'k2', 'v')