Tags: python, nlp, spacy, multiclass-classification
I am trying to do multiclass classification using the CrowdFlower text classification dataset. Here is my code:
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals

from pathlib import Path
import pandas as pd
import spacy
from spacy.util import minibatch, compounding


def main(model=None, output_dir=None, n_iter=20):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    for i in ['neutral', 'worry', 'happiness', 'sadness', 'love', 'surprise',
              'fun', 'relief', 'hate', 'enthusiasm', 'boredom', 'anger']:
        textcat.add_label(i)

    df = pd.read_csv('text_emotion.csv')
    df.drop(['tweet_id', 'author'], axis=1, inplace=True)
    df = df[df['sentiment'] != 'empty']

    train_data = list(zip(unicode(df['content']),
                          [{u'cats': unicode(cats)} for cats in df['sentiment']]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t'.format('LOSS'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                # print('texts: '+str(texts))
                # print('annotations: '+str(annotations))
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            # with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            print('{0:.3f}'  # print a simple table
                  .format(losses['textcat']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)


if __name__ == '__main__':
    main()
I get the following error:
batch: [(u'1', {u'cats': u'sadness'}), (u' ', {u'cats': u'sadness'}), (u' ', {u'cats': u'enthusiasm'}), (u' ', {u'cats': u'neutral'})]
Traceback (most recent call last):
  File "/Users/loginofdeath/Documents/24Feb/emo_cat.py", line 91, in <module>
    main()
  File "/Users/loginofdeath/Document/24Feb/emo_cat.py", line 63, in main
    nlp.update(texts, annotations, sgd=optimizer, drop=0.2,losses=losses)
  File "/usr/local/lib/python2.7/site-packages/spacy/language.py", line 399, in update
    gold = GoldParse(doc, **gold)
  File "gold.pyx", line 430, in spacy.gold.GoldParse.__init__
ValueError: dictionary update sequence element #0 has length 1; 2 is required
I am using:
Python version: 2.7.14
Platform: Darwin-16.4.0-x86_64-i386-64bit
spaCy version: 2.0.9
Models: en
Can anyone help me? Is my approach to multiclass classification in spaCy correct? Thanks in advance.
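The traceback points at the format of the cats annotation: spaCy's GoldParse expects cats to be a dictionary mapping every label to a score, while the code above passes a bare string such as u'sadness', so converting it to a dict fails with exactly this ValueError. A minimal sketch of the two formats (label names taken from the code above, scores illustrative), which is essentially what the answer below builds for every row:

# what the code above passes for each example -- a bare string label
bad = (u"some tweet text", {u'cats': u'sadness'})  # dict(u'sadness') -> ValueError

# what spaCy's textcat expects -- one score per label (1 for the true class, 0 for the rest)
good = (u"some tweet text",
        {u'cats': {u'sadness': 1, u'neutral': 0, u'worry': 0}})  # ...and so on for all twelve labels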
All credit for this answer goes to Vikas Singh. Here is the code:
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals

from pathlib import Path
import pandas as pd
import spacy
import copy
from spacy.util import minibatch, compounding
import re


def clean_string(mystring):
    return re.sub('[^A-Za-z\ 0-9 ]+', '', mystring)


def main(model=None, output_dir=None, n_iter=2):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    for i in ['neutral', 'worry', 'happiness', 'sadness', 'love', 'surprise',
              'fun', 'relief', 'hate', 'enthusiasm', 'boredom', 'anger']:
        textcat.add_label(i)

    df = pd.read_csv('text_emotion.csv')
    df.drop(['tweet_id', 'author'], axis=1, inplace=True)
    df = df[df['sentiment'] != 'empty']

    sentiment_values = df['sentiment'].unique()
    labels_default = dict((v, 0) for v in sentiment_values)

    train_data = []
    for i, row in df.iterrows():
        label_values = copy.deepcopy(labels_default)
        label_values[row['sentiment']] = 1
        train_data.append((unicode(clean_string(row['content'])), {"cats": label_values}))

    train_data = train_data[:5000]

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t'.format('LOSS'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                # print('texts: '+str(texts))
                # print('annotations: '+str(annotations))
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            # with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            print('{0:.3f}'  # print a simple table
                  .format(losses['textcat']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, sorted(doc.cats.items(), key=lambda val: val[1], reverse=True))

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)


if __name__ == '__main__':
    main()
Note that in this code we train on only the first 5,000 examples of the training data. I hope this clears up most questions about multiclass classification in spaCy.
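If you need a single predicted label rather than the full score dictionary, you can take the highest-scoring entry of doc.cats. A small sketch, assuming the trained nlp object from the code above:

doc = nlp(u"This movie sucked")
# doc.cats maps each label to a score; pick the label with the highest score
predicted_label, score = max(doc.cats.items(), key=lambda kv: kv[1])
print(predicted_label, score)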