diff --git a/dr_botzo/markov/admin.py b/dr_botzo/markov/admin.py index aef6201..becee38 100644 --- a/dr_botzo/markov/admin.py +++ b/dr_botzo/markov/admin.py @@ -19,7 +19,7 @@ admin.site.register(MarkovTarget) admin.site.register(MarkovState) -@permission_required('import_log_file', raise_exception=True) +@permission_required('import_text_file', raise_exception=True) def import_file(request): """Accept a file upload and turn it into markov stuff. @@ -30,31 +30,58 @@ def import_file(request): if request.method == 'POST': form = LogUploadForm(request.POST, request.FILES) if form.is_valid(): - log_file = request.FILES['log_file'] - context = form.cleaned_data['context'] - ignores = form.cleaned_data['ignore_nicks'].split(',') - strips = form.cleaned_data['strip_prefixes'].split(' ') + if form.cleaned_data['text_file_format'] == LogUploadForm.FILE_FORMAT_WEECHAT: + text_file = request.FILES['text_file'] + context = form.cleaned_data['context'] + ignores = form.cleaned_data['ignore_nicks'].split(',') + strips = form.cleaned_data['strip_prefixes'].split(' ') - whos = [] - for line in log_file: - log.debug(line) - (timestamp, who, what) = line.decode('utf-8').split('\t', 2) + whos = [] + for line in text_file: + log.debug(line) + (timestamp, who, what) = line.decode('utf-8').split('\t', 2) - if who in ('-->', '<--', '--', ' *'): - continue + if who in ('-->', '<--', '--', ' *'): + continue - if who in ignores: - continue + if who in ignores: + continue - whos.append(who) + whos.append(who) - # this is a line we probably care about now - what = [x for x in what.rstrip().split(' ') if x not in strips] - markovlib.learn_line(' '.join(what), context) - log.debug("learned") + # this is a line we probably care about now + what = [x for x in what.rstrip().split(' ') if x not in strips] + markovlib.learn_line(' '.join(what), context) + log.debug("learned") - log.debug(set(whos)) - form = LogUploadForm() + log.debug(set(whos)) + form = LogUploadForm() + elif form.cleaned_data['text_file_format'] == LogUploadForm.FILE_FORMAT_RAW_TEXT: + text_file = request.FILES['text_file'] + context = form.cleaned_data['context'] + + k1 = MarkovState._start1 + k2 = MarkovState._start2 + for line in text_file: + for word in [x for x in line.decode('utf-8') .rstrip().split(' ')]: + log.info(word) + if word: + state, created = MarkovState.objects.get_or_create(context=context, k1=k1, + k2=k2, v=word) + state.count += 1 + state.save() + + if word[-1] in ['.', '?', '!']: + state, created = MarkovState.objects.get_or_create(context=context, k1=k2, + k2=word, v=MarkovState._stop) + state.count += 1 + state.save() + + k1 = MarkovState._start1 + k2 = MarkovState._start2 + else: + k1 = k2 + k2 = word else: form = LogUploadForm() diff --git a/dr_botzo/markov/forms.py b/dr_botzo/markov/forms.py index 4b442e5..a07093a 100644 --- a/dr_botzo/markov/forms.py +++ b/dr_botzo/markov/forms.py @@ -2,7 +2,7 @@ import logging -from django.forms import Form, CharField, FileField, ModelChoiceField +from django.forms import Form, CharField, ChoiceField, FileField, ModelChoiceField from markov.models import MarkovContext @@ -13,11 +13,20 @@ class LogUploadForm(Form): """Accept a file upload that will be imported into Markov stuff.""" - log_file = FileField(help_text="Weechat log format.") + FILE_FORMAT_WEECHAT = 'WEECHAT' + FILE_FORMAT_RAW_TEXT = 'RAW' + + FILE_FORMAT_CHOICES = ( + (FILE_FORMAT_WEECHAT, "Weechat"), + (FILE_FORMAT_RAW_TEXT, "Raw text file"), + ) + + text_file = FileField() + text_file_format = ChoiceField(choices=FILE_FORMAT_CHOICES) context = ModelChoiceField(queryset=MarkovContext.objects.all()) - ignore_nicks = CharField(help_text="Comma-separated list of nicks to ignore.", + ignore_nicks = CharField(help_text="Comma-separated list of nicks to ignore. For Weechat logs.", required=False) - strip_prefixes = CharField(help_text="Space-separated list of line prefixes to strip.", + strip_prefixes = CharField(help_text="Space-separated list of line prefixes to strip. For Weechat logs.", required=False) diff --git a/dr_botzo/markov/ircplugin.py b/dr_botzo/markov/ircplugin.py index 8bcfb9c..732767f 100644 --- a/dr_botzo/markov/ircplugin.py +++ b/dr_botzo/markov/ircplugin.py @@ -55,12 +55,10 @@ class Markov(Plugin): topics = [x for x in line.split(' ') if len(x) >= 3] return self.bot.reply(event, " ".join(markovlib.generate_line(context, topics=topics, - min_words=min_size, max_words=max_size, - max_sentences=1))) + min_words=min_size, max_words=max_size))) else: return self.bot.reply(event, " ".join(markovlib.generate_line(context, min_words=min_size, - max_words=max_size, - max_sentences=1))) + max_words=max_size))) def handle_chatter(self, connection, event): """Learn from IRC chatter.""" @@ -98,17 +96,13 @@ class Markov(Plugin): topics = [x for x in addressed_re.match(what).group(1).split(' ') if len(x) >= 3] return self.bot.reply(event, "{0:s}: {1:s}" - "".format(nick, " ".join(markovlib.generate_line(context, - topics=topics, - max_sentences=1)))) + "".format(nick, " ".join(markovlib.generate_line(context, topics=topics)))) else: # i wasn't addressed directly, so just respond topics = [x for x in what.split(' ') if len(x) >= 3] return self.bot.reply(event, "{0:s}" - "".format(" ".join(markovlib.generate_line(context, - topics=topics, - max_sentences=1)))) + "".format(" ".join(markovlib.generate_line(context, topics=topics)))) plugin = Markov diff --git a/dr_botzo/markov/lib.py b/dr_botzo/markov/lib.py index a03bab6..9e7243a 100644 --- a/dr_botzo/markov/lib.py +++ b/dr_botzo/markov/lib.py @@ -9,22 +9,21 @@ from markov.models import MarkovContext, MarkovState, MarkovTarget log = logging.getLogger('markov.lib') -def generate_line(context, topics=None, min_words=15, max_words=30, max_sentences=3): +def generate_line(context, topics=None, min_words=15, max_words=30, sentence_bias=2, max_tries=5): """String multiple sentences together into a coherent sentence.""" tries = 0 - sentences = 0 line = [] - while tries < 5: - line += generate_longish_sentence(context, topics=topics, max_words=max_words) - sentences += 1 - if sentences >= max_sentences: - return line + min_words_per_sentence = min_words / sentence_bias + while tries < max_tries: + line += generate_longish_sentence(context, topics=topics, min_words=min_words_per_sentence, + max_words=max_words, max_tries=max_tries) if len(line) >= min_words: return line else: - if line[-1][-1] not in [',', '.', '!']: - line[-1] += random.choice([',', '.', '!']) + if len(line) > 0: + if line[-1][-1] not in [',', '.', '!', '?', ':']: + line[-1] += random.choice(['?', '.', '!']) tries += 1 @@ -32,22 +31,26 @@ def generate_line(context, topics=None, min_words=15, max_words=30, max_sentence return line -def generate_longish_sentence(context, topics=None, min_words=4, max_words=30): +def generate_longish_sentence(context, topics=None, min_words=15, max_words=30, max_tries=100): """Generate a Markov chain, but throw away the short ones unless we get desperate.""" + sent = "" tries = 0 - while tries < 5: - sent = generate_sentence(context, topics=topics, max_words=max_words) + while tries < max_tries: + sent = generate_sentence(context, topics=topics, min_words=min_words, max_words=max_words) if len(sent) >= min_words: + log.debug("found a longish sentence, %s", sent) return sent + else: + log.debug("%s isn't long enough, going to try again", sent) tries += 1 # if we got here, we need to just give up - return generate_sentence(context) + return sent -def generate_sentence(context, topics=None, max_words=30): +def generate_sentence(context, topics=None, min_words=15, max_words=30): """Generate a Markov chain.""" words = [] @@ -64,23 +67,63 @@ def generate_sentence(context, topics=None, max_words=30): while len(words) <= max_words and words[0] != MarkovState._start2: log.debug("looking backwards for '{0:s}'".format(words[0])) new_states = MarkovState.objects.filter(context=context, v=words[0]) - words.insert(0, get_word_out_of_states(new_states, backwards=True)) + # if we find a start, use it + if MarkovState._start2 in new_states: + log.debug("found a start2 in the results, intentionally picking it") + words.insert(0, MarkovState._start2) + else: + words.insert(0, get_word_out_of_states(new_states, backwards=True)) + log.debug("picked %s", words[0]) - # if we didn't get topic stuff, we need to start (forwards) here + # if what we found is too long, abandon it, sadly + if len(words) > max_words: + log.debug("%s is too long, i'm going to give up on it", words) + words.clear() + + # if we didn't get topic stuff, we need to start (forwards) here, otherwise we use + # what we already put together (obviously) if len(words) == 0: words = [MarkovState._start1, MarkovState._start2] i = len(words) - while len(words) <= max_words and words[-1] != MarkovState._stop: + while words[-1] != MarkovState._stop: log.debug("looking for '{0:s}','{1:s}'".format(words[i-2], words[i-1])) new_states = MarkovState.objects.filter(context=context, k1=words[i-2], k2=words[i-1]) log.debug("states retrieved") - words.append(get_word_out_of_states(new_states)) + + # try to find states that are in our targets + if topics and len(topics): + target_hits = list(set(words).intersection(set(topics))) + else: + target_hits = [] + + if len(words) > min_words and MarkovState._stop in new_states: + # if we're over min_words, and got a stop naturally, use it + log.debug("found a stop in the results, intentionally picking it") + words.append(MarkovState._stop) + elif len(target_hits) > 0: + # if there's a target word in the states, pick it + target_hit = random.choice(target_hits) + log.debug("found a topic hit %s, using it", target_hit) + topics.remove(target_hit) + words.append(target_hit) + elif len(words) <= min_words: + # if we still need more words, intentionally avoid stop + words.append(get_word_out_of_states(new_states.exclude(v=MarkovState._stop))) + log.debug("picked (stop avoidance) %s", words[-1]) + else: + words.append(get_word_out_of_states(new_states)) + log.debug("picked %s", words[-1]) i += 1 words = [word for word in words if word not in (MarkovState._start1, MarkovState._start2, MarkovState._stop)] + # if what we found is too long, abandon it, sadly + if len(words) > max_words: + log.debug("%s is too long, i'm going to give up on it", words) + words.clear() + return words diff --git a/dr_botzo/markov/migrations/0003_auto_20161112_2348.py b/dr_botzo/markov/migrations/0003_auto_20161112_2348.py new file mode 100644 index 0000000..bededdd --- /dev/null +++ b/dr_botzo/markov/migrations/0003_auto_20161112_2348.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('markov', '0002_auto_20150514_2317'), + ] + + operations = [ + migrations.AlterModelOptions( + name='markovstate', + options={'permissions': set([('import_text_file', 'Can import states from a text file'), ('teach_line', 'Can teach lines')])}, + ), + ] diff --git a/dr_botzo/markov/models.py b/dr_botzo/markov/models.py index 560615f..c78a19b 100644 --- a/dr_botzo/markov/models.py +++ b/dr_botzo/markov/models.py @@ -59,7 +59,7 @@ class MarkovState(models.Model): ['context', 'v'], ] permissions = { - ('import_log_file', "Can import states from a log file"), + ('import_text_file', "Can import states from a text file"), ('teach_line', "Can teach lines"), } unique_together = ('context', 'k1', 'k2', 'v')