如何将sklearn Pipeline与自定义特征一起使用?

iva*_*lan 4 python pipeline classification machine-learning scikit-learn

我正在使用Python和sklearn进行文本分类。除了向量化器(vectorizer)之外,我还有一些自定义特征。我想知道是否可以将它们与sklearn Pipeline一起使用,以及如何在其中堆叠这些特征。

下面是我目前不使用管道的分类代码的简短示例。如果您发现其中有任何错误,请告诉我,非常感谢您的帮助。是否可以通过某种方式在sklearn管道中使用它?我创建了自己的函数get_features(),该函数提取自定义特征、用向量化器转换文本、缩放特征,并最终将所有特征堆叠在一起。

import sklearn.svm
import re
from sklearn import metrics
import numpy
import scipy.sparse
import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import StandardScaler

# custom feature example
def words_capitalized(sentence):
    """Count the tokens of ``sentence`` that start with an uppercase character.

    The sentence is split with ``word_tokenize``; a token is counted when
    ``str.isupper()`` is true for its first character.

    Returns:
        The number of such tokens, as an int.
    """
    return sum(1 for token in word_tokenize(sentence) if token[0].isupper())

# custom feature example
def words_length(sentence):
    """Return the character length of every token in ``sentence``.

    The sentence is split with ``word_tokenize``.

    Returns:
        A list with one int per token, in token order. The list is empty
        when the tokenizer yields no tokens.
    """
    # ``len`` is the builtin that measures a string; Python has no
    # ``length`` function.
    return [len(word) for word in word_tokenize(sentence)]

def get_features(untagged_text, value, scaler):
    """Build the combined feature matrix for a collection of sentences.

    The result stacks, column-wise:

    1. the n-gram counts produced by the module-level ``countVecWord``
       vectorizer,
    2. the mean token length of each sentence,
    3. the number of capitalized tokens in each sentence
       (``words_capitalized``).

    The two hand-crafted columns are scaled with ``scaler``.

    Args:
        untagged_text: iterable of sentence strings.
        value: ``1`` marks the training set, in which case the vectorizer
            and ``scaler`` are fitted on this data before transforming it.
            Any other value applies the already fitted vectorizer and
            scaler without refitting.
        scaler: object exposing ``fit_transform`` and ``transform``
            (a ``StandardScaler`` in the calling script).

    Returns:
        A scipy sparse matrix with one row per sentence.
    """
    # The text is traversed twice (vectorizer + custom features), so make
    # sure a one-shot iterator is not exhausted after the first pass.
    untagged_text = list(untagged_text)
    is_training = value == 1

    # The vectorizer must learn its vocabulary on the training set before it
    # can transform anything; the dev/test set reuses that vocabulary.
    # NOTE: calling ``.todense()`` on the result would turn the sparse count
    # matrix into a dense one. It is not needed here because
    # ``scipy.sparse.hstack`` accepts sparse input, and a dense n-gram matrix
    # can be very large.
    if is_training:
        X_bow = countVecWord.fit_transform(untagged_text)
    else:
        X_bow = countVecWord.transform(untagged_text)

    # A feature matrix needs a fixed number of columns per row. A list of
    # per-token lengths has a different size for every sentence, so it is
    # reduced to a single scalar (the mean token length) here.
    custom_rows = []
    for sentence in untagged_text:
        token_lengths = [len(token) for token in word_tokenize(sentence)]
        if token_lengths:
            mean_length = float(sum(token_lengths)) / len(token_lengths)
        else:
            mean_length = 0.0
        custom_rows.append([mean_length, words_capitalized(sentence)])

    # reshape keeps the array two-dimensional even for an empty input.
    X_custom = numpy.array(custom_rows, dtype=float).reshape(-1, 2)

    # Scale both custom columns in ONE call: the scaler keeps separate
    # statistics per column. Fitting the same scaler once per feature would
    # overwrite the first feature's statistics with the second's, and the
    # dev/test set would then be scaled with the wrong mean/variance.
    if is_training:
        X_custom = scaler.fit_transform(X_custom)
    else:
        X_custom = scaler.transform(X_custom)

    # Stack all features as a sparse matrix; each feature appears once.
    return scipy.sparse.hstack((X_bow, X_custom))

def fit_and_predict(train_labels, train_features, test_features, classifier):
    """Train ``classifier`` and return its predictions for the test features.

    Args:
        train_labels: target values passed as the second argument to
            ``classifier.fit``.
        train_features: training matrix passed as the first argument to
            ``classifier.fit``.
        test_features: matrix handed to ``classifier.predict``.
        classifier: object exposing ``fit(features, labels)`` and
            ``predict(features)``.

    Returns:
        Whatever ``classifier.predict(test_features)`` returns.
    """
    # Training happens in place on the classifier object.
    classifier.fit(train_features, train_labels)

    predictions = classifier.predict(test_features)
    return predictions

if __name__ == '__main__':

    # read_data() is defined outside this excerpt. Its result is indexed
    # positionally: training texts, training labels, dev texts, dev labels.
    input_sets = read_data()

    X = input_sets[0]
    Y = input_sets[1]
    X_dev = input_sets[2]
    Y_dev = input_sets[3]

    # initialize the count vectorizer (word uni-, bi- and tri-grams);
    # get_features() reads it as a module-level global
    countVecWord = CountVectorizer(ngram_range=(1, 3))

    scaler = StandardScaler()

    # extract features

    # for training (value 1 selects the "fit" branch of get_features)
    X_total = get_features(X, 1, scaler)

    # for dev set (any value other than 1 selects the "transform only" branch)
    X_total_dev = get_features(X_dev, 2, scaler)

    # store labels as numpy array
    y_train = numpy.asarray(Y)
    y_dev = numpy.asarray(Y_dev)

    # train the classifier
    SVC1 = LinearSVC(C=1.0)

    y_predicted = fit_and_predict(y_train, X_total, X_total_dev, SVC1)

    # Single-argument print(...) calls behave the same under Python 2 and
    # Python 3; the bare ``print "..."`` statement is a SyntaxError on
    # Python 3. The spacing reproduces what the comma-separated Python 2
    # print statement emitted.
    print("Result for dev set")
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_dev, y_predicted)
    print("Precision:  {0}  Recall:  {1}  F1-Score:  {2}".format(precision, recall, f1))
Run Code Online (Sandbox Code Playgroud)

我知道有FeatureUnion,但是我不知道它是否适用于我的需求,以及是否可以用它来缩放和堆叠这些特征。

编辑:这似乎是一个好的开始:https://michelleful.github.io/code-blog/2015/06/20/pipelines/

我还没有尝试过,等我试过之后会回来更新。现在的问题是,如何在管道中进行特征选择。

iva*_*lan 5

供感兴趣的人参考:自定义特征类需要实现fit和transform方法,然后就可以在FeatureUnion中使用。有关详细示例,请在此处查看我的另一个问题> 如何将不同的输入适合sklearn管道?