Merge branch 'markov-tweaks' into 'master'
Markov tweaks: new sentence generator, new import. This changes the way that sentences are generated, ideally making generation a bit faster than before, and also adds another import method that just adds raw text rather than assuming IRC logs. See merge request !9.
This commit is contained in:
commit
1b8faaca9e
|
@ -19,7 +19,7 @@ admin.site.register(MarkovTarget)
|
|||
admin.site.register(MarkovState)
|
||||
|
||||
|
||||
@permission_required('import_log_file', raise_exception=True)
|
||||
@permission_required('import_text_file', raise_exception=True)
|
||||
def import_file(request):
|
||||
"""Accept a file upload and turn it into markov stuff.
|
||||
|
||||
|
@ -30,31 +30,58 @@ def import_file(request):
|
|||
if request.method == 'POST':
|
||||
form = LogUploadForm(request.POST, request.FILES)
|
||||
if form.is_valid():
|
||||
log_file = request.FILES['log_file']
|
||||
context = form.cleaned_data['context']
|
||||
ignores = form.cleaned_data['ignore_nicks'].split(',')
|
||||
strips = form.cleaned_data['strip_prefixes'].split(' ')
|
||||
if form.cleaned_data['text_file_format'] == LogUploadForm.FILE_FORMAT_WEECHAT:
|
||||
text_file = request.FILES['text_file']
|
||||
context = form.cleaned_data['context']
|
||||
ignores = form.cleaned_data['ignore_nicks'].split(',')
|
||||
strips = form.cleaned_data['strip_prefixes'].split(' ')
|
||||
|
||||
whos = []
|
||||
for line in log_file:
|
||||
log.debug(line)
|
||||
(timestamp, who, what) = line.decode('utf-8').split('\t', 2)
|
||||
whos = []
|
||||
for line in text_file:
|
||||
log.debug(line)
|
||||
(timestamp, who, what) = line.decode('utf-8').split('\t', 2)
|
||||
|
||||
if who in ('-->', '<--', '--', ' *'):
|
||||
continue
|
||||
if who in ('-->', '<--', '--', ' *'):
|
||||
continue
|
||||
|
||||
if who in ignores:
|
||||
continue
|
||||
if who in ignores:
|
||||
continue
|
||||
|
||||
whos.append(who)
|
||||
whos.append(who)
|
||||
|
||||
# this is a line we probably care about now
|
||||
what = [x for x in what.rstrip().split(' ') if x not in strips]
|
||||
markovlib.learn_line(' '.join(what), context)
|
||||
log.debug("learned")
|
||||
# this is a line we probably care about now
|
||||
what = [x for x in what.rstrip().split(' ') if x not in strips]
|
||||
markovlib.learn_line(' '.join(what), context)
|
||||
log.debug("learned")
|
||||
|
||||
log.debug(set(whos))
|
||||
form = LogUploadForm()
|
||||
log.debug(set(whos))
|
||||
form = LogUploadForm()
|
||||
elif form.cleaned_data['text_file_format'] == LogUploadForm.FILE_FORMAT_RAW_TEXT:
|
||||
text_file = request.FILES['text_file']
|
||||
context = form.cleaned_data['context']
|
||||
|
||||
k1 = MarkovState._start1
|
||||
k2 = MarkovState._start2
|
||||
for line in text_file:
|
||||
for word in [x for x in line.decode('utf-8') .rstrip().split(' ')]:
|
||||
log.info(word)
|
||||
if word:
|
||||
state, created = MarkovState.objects.get_or_create(context=context, k1=k1,
|
||||
k2=k2, v=word)
|
||||
state.count += 1
|
||||
state.save()
|
||||
|
||||
if word[-1] in ['.', '?', '!']:
|
||||
state, created = MarkovState.objects.get_or_create(context=context, k1=k2,
|
||||
k2=word, v=MarkovState._stop)
|
||||
state.count += 1
|
||||
state.save()
|
||||
|
||||
k1 = MarkovState._start1
|
||||
k2 = MarkovState._start2
|
||||
else:
|
||||
k1 = k2
|
||||
k2 = word
|
||||
else:
|
||||
form = LogUploadForm()
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
import logging
|
||||
|
||||
from django.forms import Form, CharField, FileField, ModelChoiceField
|
||||
from django.forms import Form, CharField, ChoiceField, FileField, ModelChoiceField
|
||||
|
||||
from markov.models import MarkovContext
|
||||
|
||||
|
@ -13,11 +13,20 @@ class LogUploadForm(Form):
|
|||
|
||||
"""Accept a file upload that will be imported into Markov stuff."""
|
||||
|
||||
log_file = FileField(help_text="Weechat log format.")
|
||||
FILE_FORMAT_WEECHAT = 'WEECHAT'
|
||||
FILE_FORMAT_RAW_TEXT = 'RAW'
|
||||
|
||||
FILE_FORMAT_CHOICES = (
|
||||
(FILE_FORMAT_WEECHAT, "Weechat"),
|
||||
(FILE_FORMAT_RAW_TEXT, "Raw text file"),
|
||||
)
|
||||
|
||||
text_file = FileField()
|
||||
text_file_format = ChoiceField(choices=FILE_FORMAT_CHOICES)
|
||||
context = ModelChoiceField(queryset=MarkovContext.objects.all())
|
||||
ignore_nicks = CharField(help_text="Comma-separated list of nicks to ignore.",
|
||||
ignore_nicks = CharField(help_text="Comma-separated list of nicks to ignore. For Weechat logs.",
|
||||
required=False)
|
||||
strip_prefixes = CharField(help_text="Space-separated list of line prefixes to strip.",
|
||||
strip_prefixes = CharField(help_text="Space-separated list of line prefixes to strip. For Weechat logs.",
|
||||
required=False)
|
||||
|
||||
|
||||
|
|
|
@ -55,12 +55,10 @@ class Markov(Plugin):
|
|||
topics = [x for x in line.split(' ') if len(x) >= 3]
|
||||
|
||||
return self.bot.reply(event, " ".join(markovlib.generate_line(context, topics=topics,
|
||||
min_words=min_size, max_words=max_size,
|
||||
max_sentences=1)))
|
||||
min_words=min_size, max_words=max_size)))
|
||||
else:
|
||||
return self.bot.reply(event, " ".join(markovlib.generate_line(context, min_words=min_size,
|
||||
max_words=max_size,
|
||||
max_sentences=1)))
|
||||
max_words=max_size)))
|
||||
|
||||
def handle_chatter(self, connection, event):
|
||||
"""Learn from IRC chatter."""
|
||||
|
@ -98,17 +96,13 @@ class Markov(Plugin):
|
|||
topics = [x for x in addressed_re.match(what).group(1).split(' ') if len(x) >= 3]
|
||||
|
||||
return self.bot.reply(event, "{0:s}: {1:s}"
|
||||
"".format(nick, " ".join(markovlib.generate_line(context,
|
||||
topics=topics,
|
||||
max_sentences=1))))
|
||||
"".format(nick, " ".join(markovlib.generate_line(context, topics=topics))))
|
||||
else:
|
||||
# i wasn't addressed directly, so just respond
|
||||
topics = [x for x in what.split(' ') if len(x) >= 3]
|
||||
|
||||
return self.bot.reply(event, "{0:s}"
|
||||
"".format(" ".join(markovlib.generate_line(context,
|
||||
topics=topics,
|
||||
max_sentences=1))))
|
||||
"".format(" ".join(markovlib.generate_line(context, topics=topics))))
|
||||
|
||||
|
||||
plugin = Markov
|
||||
|
|
|
@ -9,22 +9,21 @@ from markov.models import MarkovContext, MarkovState, MarkovTarget
|
|||
log = logging.getLogger('markov.lib')
|
||||
|
||||
|
||||
def generate_line(context, topics=None, min_words=15, max_words=30, max_sentences=3):
|
||||
def generate_line(context, topics=None, min_words=15, max_words=30, sentence_bias=2, max_tries=5):
|
||||
"""String multiple sentences together into a coherent sentence."""
|
||||
|
||||
tries = 0
|
||||
sentences = 0
|
||||
line = []
|
||||
while tries < 5:
|
||||
line += generate_longish_sentence(context, topics=topics, max_words=max_words)
|
||||
sentences += 1
|
||||
if sentences >= max_sentences:
|
||||
return line
|
||||
min_words_per_sentence = min_words / sentence_bias
|
||||
while tries < max_tries:
|
||||
line += generate_longish_sentence(context, topics=topics, min_words=min_words_per_sentence,
|
||||
max_words=max_words, max_tries=max_tries)
|
||||
if len(line) >= min_words:
|
||||
return line
|
||||
else:
|
||||
if line[-1][-1] not in [',', '.', '!']:
|
||||
line[-1] += random.choice([',', '.', '!'])
|
||||
if len(line) > 0:
|
||||
if line[-1][-1] not in [',', '.', '!', '?', ':']:
|
||||
line[-1] += random.choice(['?', '.', '!'])
|
||||
|
||||
tries += 1
|
||||
|
||||
|
@ -32,22 +31,26 @@ def generate_line(context, topics=None, min_words=15, max_words=30, max_sentence
|
|||
return line
|
||||
|
||||
|
||||
def generate_longish_sentence(context, topics=None, min_words=4, max_words=30):
|
||||
def generate_longish_sentence(context, topics=None, min_words=15, max_words=30, max_tries=100):
|
||||
"""Generate a Markov chain, but throw away the short ones unless we get desperate."""
|
||||
|
||||
sent = ""
|
||||
tries = 0
|
||||
while tries < 5:
|
||||
sent = generate_sentence(context, topics=topics, max_words=max_words)
|
||||
while tries < max_tries:
|
||||
sent = generate_sentence(context, topics=topics, min_words=min_words, max_words=max_words)
|
||||
if len(sent) >= min_words:
|
||||
log.debug("found a longish sentence, %s", sent)
|
||||
return sent
|
||||
else:
|
||||
log.debug("%s isn't long enough, going to try again", sent)
|
||||
|
||||
tries += 1
|
||||
|
||||
# if we got here, we need to just give up
|
||||
return generate_sentence(context)
|
||||
return sent
|
||||
|
||||
|
||||
def generate_sentence(context, topics=None, max_words=30):
|
||||
def generate_sentence(context, topics=None, min_words=15, max_words=30):
|
||||
"""Generate a Markov chain."""
|
||||
|
||||
words = []
|
||||
|
@ -64,23 +67,63 @@ def generate_sentence(context, topics=None, max_words=30):
|
|||
while len(words) <= max_words and words[0] != MarkovState._start2:
|
||||
log.debug("looking backwards for '{0:s}'".format(words[0]))
|
||||
new_states = MarkovState.objects.filter(context=context, v=words[0])
|
||||
words.insert(0, get_word_out_of_states(new_states, backwards=True))
|
||||
# if we find a start, use it
|
||||
if MarkovState._start2 in new_states:
|
||||
log.debug("found a start2 in the results, intentionally picking it")
|
||||
words.insert(0, MarkovState._start2)
|
||||
else:
|
||||
words.insert(0, get_word_out_of_states(new_states, backwards=True))
|
||||
log.debug("picked %s", words[0])
|
||||
|
||||
# if we didn't get topic stuff, we need to start (forwards) here
|
||||
# if what we found is too long, abandon it, sadly
|
||||
if len(words) > max_words:
|
||||
log.debug("%s is too long, i'm going to give up on it", words)
|
||||
words.clear()
|
||||
|
||||
# if we didn't get topic stuff, we need to start (forwards) here, otherwise we use
|
||||
# what we already put together (obviously)
|
||||
if len(words) == 0:
|
||||
words = [MarkovState._start1, MarkovState._start2]
|
||||
|
||||
i = len(words)
|
||||
while len(words) <= max_words and words[-1] != MarkovState._stop:
|
||||
while words[-1] != MarkovState._stop:
|
||||
log.debug("looking for '{0:s}','{1:s}'".format(words[i-2], words[i-1]))
|
||||
new_states = MarkovState.objects.filter(context=context, k1=words[i-2], k2=words[i-1])
|
||||
log.debug("states retrieved")
|
||||
words.append(get_word_out_of_states(new_states))
|
||||
|
||||
# try to find states that are in our targets
|
||||
if topics and len(topics):
|
||||
target_hits = list(set(words).intersection(set(topics)))
|
||||
else:
|
||||
target_hits = []
|
||||
|
||||
if len(words) > min_words and MarkovState._stop in new_states:
|
||||
# if we're over min_words, and got a stop naturally, use it
|
||||
log.debug("found a stop in the results, intentionally picking it")
|
||||
words.append(MarkovState._stop)
|
||||
elif len(target_hits) > 0:
|
||||
# if there's a target word in the states, pick it
|
||||
target_hit = random.choice(target_hits)
|
||||
log.debug("found a topic hit %s, using it", target_hit)
|
||||
topics.remove(target_hit)
|
||||
words.append(target_hit)
|
||||
elif len(words) <= min_words:
|
||||
# if we still need more words, intentionally avoid stop
|
||||
words.append(get_word_out_of_states(new_states.exclude(v=MarkovState._stop)))
|
||||
log.debug("picked (stop avoidance) %s", words[-1])
|
||||
else:
|
||||
words.append(get_word_out_of_states(new_states))
|
||||
log.debug("picked %s", words[-1])
|
||||
i += 1
|
||||
|
||||
words = [word for word in words if word not in
|
||||
(MarkovState._start1, MarkovState._start2, MarkovState._stop)]
|
||||
|
||||
# if what we found is too long, abandon it, sadly
|
||||
if len(words) > max_words:
|
||||
log.debug("%s is too long, i'm going to give up on it", words)
|
||||
words.clear()
|
||||
|
||||
return words
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Alter the model options of ``markovstate`` to set its custom permissions."""

    # this migration depends on the earlier markov app migration named below
    dependencies = [('markov', '0002_auto_20150514_2317')]

    operations = [
        migrations.AlterModelOptions(
            name='markovstate',
            options={
                'permissions': {
                    ('import_text_file', 'Can import states from a text file'),
                    ('teach_line', 'Can teach lines'),
                },
            },
        ),
    ]
|
|
@ -59,7 +59,7 @@ class MarkovState(models.Model):
|
|||
['context', 'v'],
|
||||
]
|
||||
permissions = {
|
||||
('import_log_file', "Can import states from a log file"),
|
||||
('import_text_file', "Can import states from a text file"),
|
||||
('teach_line', "Can teach lines"),
|
||||
}
|
||||
unique_together = ('context', 'k1', 'k2', 'v')
|
||||
|
|
Loading…
Reference in New Issue