Error: IndexError: index 6319 is out of bounds for axis 0 with size 0

Asked by huy*_*huy · tags: python, numpy, keras, tensorflow

The code below is taken from https://github.com/anarn2/HierarchicalAttentionNetworks/blob/master/HierarchicalAttn.py, with some minor adjustments. Although I understand what the error means, I cannot figure out how it creeps into the code below, or how to fix it. I have been stuck on this for quite a while and would really appreciate some help. Thanks!

(This is the entire code.)

# imports inferred from how the code below is used; they were not shown in the
# original post (a Keras 2.x environment is assumed)
import io
import os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk import tokenize

from keras import backend as K
from keras import initializers
from keras.layers import (GRU, Bidirectional, Dense, Embedding, Input, Layer,
                          TimeDistributed)
from keras.models import Model
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import np_utils

maxlen = 100
max_sentences = 15
max_words = 20000
embedding_dim = 100
validation_split = 0.2
reviews = []
labels = []
texts = []
glove_dir = "./glove.6B"
embeddings_index = {}


# class defining the custom attention layer
class HierarchicalAttentionNetwork(Layer):
    def __init__(self, attention_dim):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(HierarchicalAttentionNetwork, self).__init__()

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        # size of x: [batch_size, seq_len, attention_dim]
        # size of u :[batch_size, attention_dim]
        # uit = tanh(xW+b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))

        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]


def remove_html(str_a):
    p = re.compile(r'<.*?>')
    return p.sub('', str_a)


# remove all non-ASCII characters (everything outside \x00-\x7F)
def replace_non_ascii(str_a):
    return re.sub(r'[^\x00-\x7f]', r'', str_a)


# Tokenization/string cleaning for dataset
def clean_str(string):
    string = string.decode("utf-8")
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()



input_data = pd.read_csv(io.BytesIO(uploaded['labeledTrainData.tsv']), sep='\t')

for idx in range(input_data.review.shape[0]):
    text = BeautifulSoup(input_data.review[idx], features="html5lib")
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)
    np.append(labels, input_data.sentiment[idx])

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), max_sentences, maxlen), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < max_sentences:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < maxlen and tokenizer.word_index[word] < max_words:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

if np.any(np.array(labels)):
    labels = np_utils.to_categorical(np.array(labels))
#labels = to_categorical(np.asarray(labels))

print('Shape of reviews (data) tensor:', data.shape)
print('Shape of sentiment (label) tensor:', np.shape(labels))

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = np.asarray(labels)[indices.astype(int)]
#labels = labels[indices]
nb_validation_samples = int(validation_split * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in training and validation set')
print (y_train.sum(axis=0))
print (y_val.sum(axis=0))


f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

# building Hierachical Attention network
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                            input_length=maxlen, trainable=True, mask_zero=True)

sentence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
lstm_word = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
attn_word = HierarchicalAttentionNetwork(100)(lstm_word)
sentenceEncoder = Model(sentence_input, attn_word)

review_input = Input(shape=(max_sentences, maxlen), dtype='int32')
review_encoder = TimeDistributed(sentenceEncoder)(review_input)
lstm_sentence = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
attn_sentence = HierarchicalAttentionNetwork(100)(lstm_sentence)
preds = Dense(2, activation='softmax')(attn_sentence)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

print("model fitting - Hierachical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=10, batch_size=100)

Full error: error stack trace

Answer (jcc*_*ero):

Python is complaining because you are trying to access the labels array by index, but the array is empty, as your console output shows:

Shape of sentiment (label) tensor: (0,)

The problem is in this line:

np.append(labels, input_data.sentiment[idx])

In the original code you referenced, each new value is appended to labels, which is a plain Python list. That change happens in place: the original list itself is modified. np.append does not work that way; as the numpy documentation says when describing its return value (arr being the original array):

A copy of arr with values appended to axis. Note that append does not occur in-place: a new array is allocated and filled.

In other words, your original empty labels array is never modified, and that is what triggers the error later on, when the code tries to index into it.
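A quick way to see this copy semantics for yourself (a minimal standalone sketch, not taken from your code):

import numpy as np

labels = np.array([])            # empty array, as in the question
np.append(labels, 1)             # returns a NEW array; here the result is discarded
print(labels.shape)              # (0,) -- the original is untouched

labels = np.append(labels, 1)    # keeping the returned copy instead
print(labels.shape)              # (1,)

plain = []                       # a Python list, by contrast,
plain.append(1)                  # is mutated in place
print(len(plain))                # 1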

If you want to achieve the equivalent behavior, you need to change the line to:

labels = np.append(labels, input_data.sentiment[idx])

Note, however, that for the reason just explained this is very inefficient (a brand-new array is allocated and filled on every iteration); it is better to append the sentiment values directly to the original labels list, as the original code does:

labels.append(input_data.sentiment[idx])
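To make the inefficiency concrete, here is a small timing sketch (the helper names are mine and the numbers will vary by machine): a loop of np.append calls is quadratic overall, while list.append is linear:

import numpy as np
import timeit

def with_np_append(n=10_000):
    arr = np.array([])
    for i in range(n):
        arr = np.append(arr, i)   # copies the whole array every iteration: O(n) per call
    return arr

def with_list_append(n=10_000):
    lst = []
    for i in range(n):
        lst.append(i)             # amortized O(1), in place
    return np.asarray(lst)

print(timeit.timeit(with_np_append, number=1))    # typically far slower
print(timeit.timeit(with_list_append, number=1))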

See also this related SO question.
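Applied to the loop from the question, the fix would look like this (a sketch: only the labels lines change, everything else is as posted):

labels = []  # keep labels as a plain Python list
for idx in range(input_data.review.shape[0]):
    text = BeautifulSoup(input_data.review[idx], features="html5lib")
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    reviews.append(tokenize.sent_tokenize(text))
    labels.append(input_data.sentiment[idx])  # in-place append onto the list

# one-hot encode once, after the loop has finished
labels = np_utils.to_categorical(np.asarray(labels))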