markov: import generic text files

This commit is contained in:
Brian S. Stephan 2016-11-16 12:10:42 -06:00
parent 8fded6ba6c
commit 6e21416791
4 changed files with 79 additions and 25 deletions

View File

@ -19,7 +19,7 @@ admin.site.register(MarkovTarget)
admin.site.register(MarkovState) admin.site.register(MarkovState)
@permission_required('import_log_file', raise_exception=True) @permission_required('import_text_file', raise_exception=True)
def import_file(request): def import_file(request):
"""Accept a file upload and turn it into markov stuff. """Accept a file upload and turn it into markov stuff.
@ -30,31 +30,58 @@ def import_file(request):
if request.method == 'POST': if request.method == 'POST':
form = LogUploadForm(request.POST, request.FILES) form = LogUploadForm(request.POST, request.FILES)
if form.is_valid(): if form.is_valid():
log_file = request.FILES['log_file'] if form.cleaned_data['text_file_format'] == LogUploadForm.FILE_FORMAT_WEECHAT:
context = form.cleaned_data['context'] text_file = request.FILES['text_file']
ignores = form.cleaned_data['ignore_nicks'].split(',') context = form.cleaned_data['context']
strips = form.cleaned_data['strip_prefixes'].split(' ') ignores = form.cleaned_data['ignore_nicks'].split(',')
strips = form.cleaned_data['strip_prefixes'].split(' ')
whos = [] whos = []
for line in log_file: for line in text_file:
log.debug(line) log.debug(line)
(timestamp, who, what) = line.decode('utf-8').split('\t', 2) (timestamp, who, what) = line.decode('utf-8').split('\t', 2)
if who in ('-->', '<--', '--', ' *'): if who in ('-->', '<--', '--', ' *'):
continue continue
if who in ignores: if who in ignores:
continue continue
whos.append(who) whos.append(who)
# this is a line we probably care about now # this is a line we probably care about now
what = [x for x in what.rstrip().split(' ') if x not in strips] what = [x for x in what.rstrip().split(' ') if x not in strips]
markovlib.learn_line(' '.join(what), context) markovlib.learn_line(' '.join(what), context)
log.debug("learned") log.debug("learned")
log.debug(set(whos)) log.debug(set(whos))
form = LogUploadForm() form = LogUploadForm()
elif form.cleaned_data['text_file_format'] == LogUploadForm.FILE_FORMAT_RAW_TEXT:
text_file = request.FILES['text_file']
context = form.cleaned_data['context']
k1 = MarkovState._start1
k2 = MarkovState._start2
for line in text_file:
for word in [x for x in line.decode('utf-8') .rstrip().split(' ')]:
log.info(word)
if word:
state, created = MarkovState.objects.get_or_create(context=context, k1=k1,
k2=k2, v=word)
state.count += 1
state.save()
if word[-1] in ['.', '?', '!']:
state, created = MarkovState.objects.get_or_create(context=context, k1=k2,
k2=word, v=MarkovState._stop)
state.count += 1
state.save()
k1 = MarkovState._start1
k2 = MarkovState._start2
else:
k1 = k2
k2 = word
else: else:
form = LogUploadForm() form = LogUploadForm()

View File

@ -2,7 +2,7 @@
import logging import logging
from django.forms import Form, CharField, FileField, ModelChoiceField from django.forms import Form, CharField, ChoiceField, FileField, ModelChoiceField
from markov.models import MarkovContext from markov.models import MarkovContext
@ -13,11 +13,20 @@ class LogUploadForm(Form):
"""Accept a file upload that will be imported into Markov stuff.""" """Accept a file upload that will be imported into Markov stuff."""
log_file = FileField(help_text="Weechat log format.") FILE_FORMAT_WEECHAT = 'WEECHAT'
FILE_FORMAT_RAW_TEXT = 'RAW'
FILE_FORMAT_CHOICES = (
(FILE_FORMAT_WEECHAT, "Weechat"),
(FILE_FORMAT_RAW_TEXT, "Raw text file"),
)
text_file = FileField()
text_file_format = ChoiceField(choices=FILE_FORMAT_CHOICES)
context = ModelChoiceField(queryset=MarkovContext.objects.all()) context = ModelChoiceField(queryset=MarkovContext.objects.all())
ignore_nicks = CharField(help_text="Comma-separated list of nicks to ignore.", ignore_nicks = CharField(help_text="Comma-separated list of nicks to ignore. For Weechat logs.",
required=False) required=False)
strip_prefixes = CharField(help_text="Space-separated list of line prefixes to strip.", strip_prefixes = CharField(help_text="Space-separated list of line prefixes to strip. For Weechat logs.",
required=False) required=False)

View File

@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Replace the model-level permission set declared on ``markovstate``."""

    dependencies = [
        ('markov', '0002_auto_20150514_2317'),
    ]

    operations = [
        # Only the model's Meta options change here; no schema is altered.
        migrations.AlterModelOptions(
            name='markovstate',
            options={
                'permissions': {
                    ('import_text_file', 'Can import states from a text file'),
                    ('teach_line', 'Can teach lines'),
                },
            },
        ),
    ]

View File

@ -59,7 +59,7 @@ class MarkovState(models.Model):
['context', 'v'], ['context', 'v'],
] ]
permissions = { permissions = {
('import_log_file', "Can import states from a log file"), ('import_text_file', "Can import states from a text file"),
('teach_line', "Can teach lines"), ('teach_line', "Can teach lines"),
} }
unique_together = ('context', 'k1', 'k2', 'v') unique_together = ('context', 'k1', 'k2', 'v')