package algorithms;

import things.RatingsMemory;
import utilities.VectorTools;

/**
 * Simple nuclear norm regularized matrix factorization by Hazan's algorithm,
 * where here we are using the squared loss (l2).
 * 
 * This is the version we used for the experiments in the ICML paper
 * http://www.icml2010.org/papers/196.pdf .
 * See also http://m8j.net/(All)Matrix%20Factorizations%20and%20SDP
 * for more information, and talk slides.
 * 
 * This uses no additional memory to store the factorization (used memory = size
 * of training set + size of test set).
 * 
 * The input data-structure, here called "RatingsMemory", needs to be a
 * sequential list of just the observed entries of the matrix that we want to
 * complete. E.g. for the Netflix dataset, there are 10^8 such observed entries.
 * 
 * Each entry in the list consists of the following 4 numbers:
 *  - [byte] rating                   (1 to 5 stars),
 *  - [int] the corresponding user    (or the matrix-row of this entry),
 *  - [short] the corresponding movie (or the matrix-column of this entry),
 *  - [float] prediction              (here we store our current low-rank 
 *                                     prediction for the particular entry)
 * 
 * We assume the training entries are followed by the test entries, and
 * contained in the same list.
 * 
 * @author martin jaggi
 * 
 */
public class NetflixHazanICML {

    // the ratings, stored sequentially. every rating contains: movie, user, rating, prediction
    private RatingsMemory ratings;
    
    // the trace regularization parameter
    private double scalingfactor;
    
    private double previousEigenvalue;
    private int numMatrixMultiplications;
    private long starttime;

    public NetflixHazanICML(RatingsMemory ratings) {
        this.ratings = ratings;
    }

    /**
     * Implements the algorithm, with some implementation variants as
     * specified in the parameters, see also the paper for more details
     * on the different variants.
     * 
     * Here we use the power method as the internal eigenvector procedure,
     * but you can replace that by any other method of your choice.
     * 
     * @param numSteps
     *            number of steps the algorithm should perform. This equals the
     *            size (rank) of the resulting factorization.
     * @param trace
     *            the scaling factor (trace of the matrices from our optimization 
     *            domain)
     * @param powerStepsSlope
     *            number of steps performed by the power method is 1 +
     *            powerStepsSlope * k.
     * @param addToDiagonalFactor
     *            fraction of the previous Eigenvalue to be added to the
     *            diagonal, to ensure stable convergence of the power method.
     * @param interpolateGradient
     *            if you want to use immediate feedback in the power method (use
     *            zero to switch this off)
     * @param bestOnLineSegment
     *            Do line search for the best step size. This maintains the
     *            convergence guarantee, at a small additional cost of computing
     *            this step-size (the cost corresponds to about a single
     *            matrix-vector multiplication per iteration, which is small)
     */
    public void run(int numSteps, double trace,
                    double powerStepsSlope, double addToDiagonalFactor,
                    double interpolateGradient, boolean bestOnLineSegment) {
        scalingfactor = trace;
        numMatrixMultiplications = 0;
		starttime = System.currentTimeMillis();
        previousEigenvalue = scalingfactor;

        System.out.println("Running NetflixHazanICML on "+ratings.numTrainingRatings+" training ratings");
        System.out.println("   numUsers="+ratings.numUsers+", numMovies="+ratings.numMovies);
        System.out.println("will perform "+numSteps+" steps (factors)");
        System.out.println("*** scalingfactor (trace) is "+scalingfactor);
        System.out.println("*** powerStepsSlope is "+powerStepsSlope+", addToDiagonalFactor is "+addToDiagonalFactor);
        System.out.println("*** interpolateGradient is "+interpolateGradient+", bestOnLineSegment is "+bestOnLineSegment);

        int matrixSize = ratings.numUsers + ratings.numMovies;

        int failedSteps = 0;
        double[] p;

        // do k rounds, each resulting in adding a new rank-1 term to the approximation
        for (int k = 1; k <= numSteps; k++) {

            // report more detailed statistics for the steps already done
            if ((k-1)%5 == 0)
                reportStatistics(k-1);

            int numPowerSteps = (int)(1+powerStepsSlope*k); 

            p = runPowerMethodForSparseGradientMatrix(numPowerSteps, matrixSize, addToDiagonalFactor, interpolateGradient, k);
            // This approximate Eigenvector is our new rank-1 estimate. Note that it is a length one vector (=> trace 1 matrix)

            double lambda; // step size

            if (!bestOnLineSegment) {
                lambda = 0.5 / k; // default step size
            } else {
                // Calculate the best point on the line segment
                // (otherwise we just use 1/k as the step size).
                // In any case, this lambda is a very informative measure of the
                // quality of the eigenvector you have obtained!
                double num = 0, denom = 0;
                for (int i = 0; i < ratings.numTrainingRatings; i++) {
                    double oldValue = ratings.getPrediction(i);
                    int user = ratings.getUser(i);
                    int movie = ratings.getMovie(i);
                    // Obtain the corresponding entry of the new rank 1 matrix
                    double newValue = p[user] * p[ratings.numUsers + movie]; 

                    num += ((ratings.getRating(i) / scalingfactor) - oldValue)*(newValue - oldValue);
                    denom += (newValue - oldValue)*(newValue - oldValue);
                }
                lambda = num / denom;
                if (lambda<=0) {
                    System.out.println("Eigenvector fails to improve!!! So it was not " +
                                       "calculated accurately enough.");
                    lambda = 0.005 / k;  // We must chose some positive step size anyway, 
                                         // as otherwise we would possibly loose the PSD property!
                    failedSteps ++;
                } else if (lambda>=1) {
                    System.out.println("Hit step!!! We can set all existing factors to " +
                                       "zero and improve by changing to this new and better rank-1 " +
                                       "estimate.");
                    lambda = 1;
                } else {
                    System.out.println("best 1/lambda = "+(1/lambda)+" and previous " +
                                       "eigenvalue was "+previousEigenvalue);
                }
            }

            // Update the predictions. This needs to update both the training AND test positions!
            for (int i = 0; i < ratings.numTrainingRatings+ratings.numTestRatings; i++) {
                double oldValue = ratings.getPrediction(i);
                int user = ratings.getUser(i);
                int movie = ratings.getMovie(i);
                // Obtain the corresponding entry of the new rank-1 matrix
                double newValue = p[user] * p[ratings.numUsers + movie];

                ratings.setPrediction(i, (1.0 - lambda) * oldValue + lambda * newValue);
            }
        }

        reportStatistics(numSteps);
        System.out.println("****************************************");
        System.out.println("percentage of failed steps is "+(failedSteps/(double)numSteps));
    }

    /**
     * Run the iterative Power method for the sparse gradient matrix \Nabla f =
     * 1/sumZ * z_i * ith constraint matrix.
     * 
     * @param numSteps
     * @param matrixSize
     * @param addToDiagonalFactor
     *            fraction of the previous Eigenvalue to be added to the
     *            diagonal, to ensure stable convergence of the power method.
     * @param interpolateGradient
     *            if this is set, we will use as the power matrix the mean of
     *            the gradient before and after the step.
     * @param k
     *            just needed if interpolation is wanted, to get the step size
     * @return
     */
    private double[] runPowerMethodForSparseGradientMatrix(int numSteps, int matrixSize,
                                                           double addToDiagonalFactor, double interpolateGradient, int k) {
         // start with a uniform unit vector
        double[] v = VectorTools.uniformUnitVectorA(matrixSize);

        double[] q;

        double addToDiagonal = addToDiagonalFactor * previousEigenvalue;

        double scalarProductWithPrevious = 0;
        int powerStep = 0;
        while (powerStep < numSteps) {
            numMatrixMultiplications++;

            // multiply p with the \Nabla f matrix
            q = new double[matrixSize];
            double observationValue, X_i, matrixEntry;
            int row, column;

            for (int i = 0; i < ratings.numTrainingRatings; i++) {
                // calculate contribution of each training observation to the matrix product with our vector
                row = ratings.getUser(i);                           // user
                column = ratings.numUsers + ratings.getMovie(i);    // movie + offset

                observationValue = ratings.getRating(i);

                if (interpolateGradient == 0) {
                    X_i = scalingfactor * ratings.getPrediction(i);
                } else {
                    // we interpolate by the mean of the gradient before and after the step
                    // interpolateGradient == 1 means only the new gradient is used,
                    // 0 means only the old, and 0.5 is the mean of the two
                    double lambda_interpolate = interpolateGradient / k; // if interpolateGradient==0.5, this is
                                                                         // half the lambda that would be used to do an actual 1/k step
                    double currentFeatureValue = v[row] * v[column];
                    X_i = scalingfactor * (
                              (1.0 - lambda_interpolate) * ratings.getPrediction(i)
                            + lambda_interpolate * currentFeatureValue  // tight feedback in Simon Funk style
                          );
                }

                // contribution at the position of X_i of the gradient
                matrixEntry = observationValue - X_i;

                q[row] += matrixEntry * v[column];

                // contribution at the position of X_i transposed of the gradient
                q[column] += matrixEntry * v[row];
            }

            // add a constant to the diagonal to ensure convergence to largest EV, not smallest
            if (addToDiagonal>0)
                q = VectorTools.linCombWithB(1, q, addToDiagonal, v);


            // a weak but interesting measure of progress:
            scalarProductWithPrevious = VectorTools.scalarProduct(v, q);

            v = q;
            VectorTools.normalize(v);

            powerStep ++;
        }
        // Averaging over the previous and the second previous obtained eigenvalue
        previousEigenvalue = (Math.abs(scalarProductWithPrevious) + previousEigenvalue) / 2.0;
        return v;
    }

    private void reportStatistics(int k) {
        double trainE = ratings.RMSEtrain(scalingfactor), testE = ratings.RMSEtest(scalingfactor);
        System.out.println("************");
        System.out.println("step k="+k+"     RMSE on the training set is: "+trainE+", and on test set is: "+testE);
        System.out.println("    approximation is "+ratings.someTrainingPredictionsToString(10, scalingfactor));
        System.out.println("    elapsed time is "+((System.currentTimeMillis()-starttime)/1000.0)+" seconds, done "+numMatrixMultiplications+" matrix multiplications.");
    }

    /**
     * Main method
     * 
     * @param args
     *            usage: first argument is either a filename of an existing
     *            memory (text of obj file)
     * 
     *            the second parameter is the scalingconstant. this will be
     *            equal to the trace regularization parameter.
     * 
     *            the third (optional) parameter is the powerStepsSlope
     *            (default 0.2)
     * 
     *            the fourth (optional) parameter is the fraction of the
     *            previous Eigenvalue to be added to the diagonal, to ensure
     *            stable convergence of the power method.
     * 
     *            the fifth (optional) parameter is the number of steps to
     *            perform
     */
    public static void main(String[] args) {
        RatingsMemory mem = null;
        if (args.length >= 1) {
            // text input format, or if the users/movies are already at no gaps
            mem = RatingsMemory.readTextFileFromDisk("datasets/txt/"+args[0]);
        }

        // Set the scaling factor, or in other words the trace regularization
        double trace = (args.length>=2) ? Double.valueOf(args[1]) : 10000;

        // The number of power steps to perform in iteration k will be k * powerStepsSlope
        double powerStepsSlope = (args.length>=3) ? Double.valueOf(args[2]) : 0.2;
        
        double addToDiagonalFactor = (args.length>=4) ? Double.valueOf(args[3]) : 0.5;
        
        int numSteps = (args.length >= 5) ? Integer.valueOf(args[4]) : 200;

        // Whether we should incorporate immediate feedback during the power iterations.
        // if set to zero, no interpolation will be made.
        double interpolateGradient = 0.5;
        // Moving to the best point on the line segment, instead of just 1/k. 
        boolean bestOnLineSegment = true;

        // Run the algorithm
        new NetflixHazanICML(mem).run(numSteps, trace,
                                      powerStepsSlope, addToDiagonalFactor,
                                      interpolateGradient, bestOnLineSegment);
    }

}