Basically, I'm trying to implement backpropagation in a network. I know the backpropagation algorithm is hard-coded here, but I'm trying to get it working first.
It works for one set of inputs and outputs, but with more than one training set the network converges on a solution for one output while the other output converges to 0.5.
That is, the output for one trial is:
[0.9969527919933012, 0.003043774988797313]
[0.5000438200377985, 0.49995612243030635]
Network.java
private ArrayList<ArrayList<ArrayList<Double>>> weights;
private ArrayList<ArrayList<Double>> nodes;
private final double LEARNING_RATE = -0.25;
private final double DEFAULT_NODE_VALUE = 0.0;
private double momentum = 1.0;
public Network() {
    weights = new ArrayList<ArrayList<ArrayList<Double>>>();
    nodes = new ArrayList<ArrayList<Double>>();
}
/**
 * This method is used to add a layer with {@code n} nodes to the network.
 * @param n number of nodes for the layer
 */
public void addLayer(int n) {
    nodes.add(new ArrayList<Double>());
    for (int i = 0;i < n;i++)
        nodes.get(nodes.size()-1).add(DEFAULT_NODE_VALUE);
}
/**
 * This method generates the weights used to link layers together.
 */
public void createWeights() {
    // there are only weights between layers, so we have one less weight layer than node layer
    for (int i = 0;i < nodes.size()-1;i++) {
        weights.add(new ArrayList<ArrayList<Double>>());
        // for each node above the weight
        for (int j = 0;j < nodes.get(i).size();j++) {
            weights.get(i).add(new ArrayList<Double>());
            // for each node below the weight
            for (int k = 0;k < nodes.get(i+1).size();k++)
                weights.get(i).get(j).add(Math.random()*2-1);
        }
    }
}
/**
 * Uses the derivative of the sigmoid function to update the weights in the network.
 * @param out   the desired output pattern for the network
 */
private void propagateBackward(double[] out) {
    /*
     * Error calculation using the squared-error formula and the sigmoid derivative
     * 
     * Output node : d_k = O_k(1-O_k)(O_k-T_k)
     * Hidden node : d_j = O_j(1-O_j) * sum over k in K of (d_k*W_jk)
     * 
     * k is an output node
     * j is a hidden node
     * 
     * dW = LEARNING_RATE * d * output of the previous layer (not weighted)
     * W = W + dW
     */
    // update the last layer of weights first because it is a special case
    double dkW = 0;
    for (int i = 0;i < nodes.get(nodes.size()-1).size();i++) {
        double outputK = nodes.get(nodes.size()-1).get(i);
        double deltaK = outputK*(1-outputK)*(outputK-out[i]);
        for (int j = 0;j < nodes.get(nodes.size()-2).size();j++) {
            weights.get(1).get(j).set(i, weights.get(1).get(j).get(i) + LEARNING_RATE*deltaK*nodes.get(nodes.size()-2).get(j) );
            dkW += deltaK*weights.get(1).get(j).get(i);
        }
    }
    for (int i = 0;i < nodes.get(nodes.size()-2).size();i++) {
        //Hidden Node : dj = Oj(1-Oj)SummationkEK(dkWjk)
        double outputJ = nodes.get(1).get(i);
        double deltaJ = outputJ*(1-outputJ)*dkW*LEARNING_RATE;
        for (int j = 0;j < nodes.get(0).size();j++) {
            weights.get(0).get(j).set(i, weights.get(0).get(j).get(i) + deltaJ*nodes.get(0).get(j) );
        }
    }
}
/**
 * Propagates an array of input values through the network.
 * @param in    an array of inputs
 */
private void propagateForward(double[] in) {
    // copy the input values into the input layer
    for (int i = 0;i < in.length;i++)
        nodes.get(0).set(i, in[i]);
    // propagate through the rest of the network
    // for each layer after the first layer
    for (int i = 1;i < nodes.size();i++)
        // for each node in the layer
        for (int j = 0;j < nodes.get(i).size();j++) {
            // for each node in the previous layer
            for (int k = 0;k < nodes.get(i-1).size();k++)
                // add to the node the weighted output from k to j
                nodes.get(i).set(j, nodes.get(i).get(j)+weightedNode(i-1, k, j));
            // once the node has received all of its inputs we can apply the activation function
            nodes.get(i).set(j, activation(nodes.get(i).get(j)));
        }   
}
/**
 * This method returns the activation value of an input
 * @param   in the total input of a node
 * @return  the sigmoid function at the input
 */
private double activation(double in) {
    return 1/(1+Math.pow(Math.E,-in));
}
/**
 * Weighted output for a node.
 * @param layer the layer which the transmitting node is on
 * @param node  the index of the transmitting node
 * @param nextNode  the index of the receiving node
 * @return  the output of the transmitting node times the weight between the two nodes
 */
private double weightedNode(int layer, int node, int nextNode) {
    return nodes.get(layer).get(node)*weights.get(layer).get(node).get(nextNode);
}
/**
 * This method resets all of the nodes to their default value
 */
private void resetNodes() {
    for (int i = 0;i < nodes.size();i++)
        for (int j = 0;j < nodes.get(i).size();j++)
            nodes.get(i).set(j, DEFAULT_NODE_VALUE);
}
/**
 * Teach the network correct responses for certain input values.
 * @param in    an array of input values
 * @param out   an array of desired output values
 * @param n     number of iterations to perform
 */
public void train(double[] in, double[] out, int n) {
    for (int i = 0;i < n;i++) {
        propagateForward(in);
        propagateBackward(out);
        resetNodes();
    }
}
public void getResult(double[] in) {
    propagateForward(in);
    System.out.println(nodes.get(2));
    resetNodes();
}
SnapSolve.java
public SnapSolve() {
    Network net = new Network();
    net.addLayer(2);
    net.addLayer(4);
    net.addLayer(2);
    net.createWeights();
    double[] l = {0, 1};
    double[] p = {1, 0};
    double[] n = {1, 0};
    double[] r = {0, 1};
    for(int i = 0;i < 100000;i++) {
        net.train(l, p, 1);
        net.train(n, r, 1);
    }
    net.getResult(l);
    net.getResult(n);
}
public static void main(String[] args) {
    new SnapSolve();
}
The initial weights you're using in the network are quite large. Usually you want to initialize the weights in a sigmoid-activated neural network proportional to the inverse of the square root of the unit's fan-in. So, for units in layer i of the network, choose initial weights between plus and minus n^{-1/2}, where n is the number of units in layer i-1. (See http://www.willamette.edu/~gorr/classes/cs449/precond.html for more information.)
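For example, createWeights() could scale the random range by the fan-in of the receiving layer. This is a minimal sketch against your existing fields, not a drop-in guarantee:

/**
 * Variation of createWeights() that draws each initial weight
 * uniformly from [-1/sqrt(n), +1/sqrt(n)], where n is the fan-in
 * of the receiving layer (the size of the layer feeding into it).
 */
public void createWeights() {
    for (int i = 0;i < nodes.size()-1;i++) {
        weights.add(new ArrayList<ArrayList<Double>>());
        // the fan-in of layer i+1 is the number of nodes in layer i
        double scale = 1.0 / Math.sqrt(nodes.get(i).size());
        for (int j = 0;j < nodes.get(i).size();j++) {
            weights.get(i).add(new ArrayList<Double>());
            for (int k = 0;k < nodes.get(i+1).size();k++)
                weights.get(i).get(j).add((Math.random()*2-1) * scale);
        }
    }
}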
The learning-rate parameter you're using also seems quite large, which can cause your network to "bounce around" during training. I would try different values for it on a logarithmic scale: 0.2, 0.1, 0.05, 0.02, 0.01, 0.005, ... until you find one that seems to work better.
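One way to run that sweep, assuming a hypothetical Network(double rate) constructor in place of the final LEARNING_RATE field:

// Hypothetical sweep; assumes Network(double rate) exists.
// Your update rule adds LEARNING_RATE*delta*output, so the rate is
// kept negative here to descend the error gradient, as in your code.
double[] rates = {0.2, 0.1, 0.05, 0.02, 0.01, 0.005};
for (double rate : rates) {
    Network net = new Network(-rate);
    net.addLayer(2);
    net.addLayer(4);
    net.addLayer(2);
    net.createWeights();
    for (int i = 0;i < 100000;i++) {
        net.train(new double[]{0, 1}, new double[]{1, 0}, 1);
        net.train(new double[]{1, 0}, new double[]{0, 1}, 1);
    }
    System.out.println("rate = " + rate);
    net.getResult(new double[]{0, 1});
    net.getResult(new double[]{1, 0});
}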
You're really only training on two examples (although the network you're using should be able to model those two points easily). You can increase the diversity of your training dataset by adding noise to the existing inputs and expecting the network to still produce the correct output. I've found this sometimes helps when using a squared-error loss (as you are) while trying to learn a binary boolean operator like XOR, since there are very few input-output pairs in the true function domain to train on.
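As a sketch, assuming the same net instance as in SnapSolve, noisy copies of the two patterns could be generated like this (the 0.1 noise scale is an assumption to tune; requires java.util.Random):

Random rng = new Random();
double[][] inputs  = {{0, 1}, {1, 0}};
double[][] targets = {{1, 0}, {0, 1}};
for (int i = 0;i < 100000;i++) {
    for (int p = 0;p < inputs.length;p++) {
        // jitter each input with small Gaussian noise,
        // keeping the target output unchanged
        double[] noisy = new double[inputs[p].length];
        for (int d = 0;d < noisy.length;d++)
            noisy[d] = inputs[p][d] + rng.nextGaussian() * 0.1;
        net.train(noisy, targets[p], 1);
    }
}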
Also, I'd like to make a general suggestion that may help you tackle problems like this: add a little code that reports the network's current error when given a known input-output pair (or an entire "validation" dataset).
If you can monitor the network's error during training, it will help you see more clearly when the network is converging; the error should decrease steadily as you train. If it bounces around, you'll know you're either using too large a learning rate or need to adapt your training dataset in some other way. If the error increases, something is wrong with your gradient computation.
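A minimal sketch of such a helper, written inside your Network class (the method name error is an assumption):

/**
 * Hypothetical helper: runs a forward pass and returns the squared
 * error of the output layer against the desired output pattern.
 */
public double error(double[] in, double[] out) {
    propagateForward(in);
    double sum = 0;
    ArrayList<Double> outputLayer = nodes.get(nodes.size()-1);
    for (int i = 0;i < outputLayer.size();i++) {
        double diff = outputLayer.get(i) - out[i];
        sum += diff * diff;
    }
    resetNodes();
    return sum;
}

Printing this value every few hundred iterations of the training loop makes convergence (or bouncing) immediately visible.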