#include <stdlib.h>
#include <stdio.h>
#include <float.h>
#include <string.h>
#include <math.h>
#include <limits.h>

#include "pepAlignFunctions.h"
#include "pepAlignDefs.h"
#include "pepAlignGA.h"
#include "StringFunctions.h"
#include "pepXMLReader.h"
#include "PeptideProphetDelegate.h"
#include "mzXMLReader.h"

/*
 * Elemental composition table for the 20 amino-acid residues.
 * Each row holds five atom counts that calculate_sequence_mass() accumulates
 * and then multiplies, column by column, with mimass[] below; the columns
 * therefore share mimass's element order (H, C, N, O, S judging by the masses).
 * The row is selected from the position of the residue letter inside
 * AMINO_ACIDS (defined elsewhere).
 * NOTE(review): the rows look like ARNDCEQGHILKMFPSTWYV (alphabetical by
 * three-letter code) - confirm this matches AMINO_ACIDS.
 */
static unsigned char aa[20][5] =
{
	{5,3,1,1,0}, 
	{12,6,4,1,0},
	{6,4,2,2,0},
	{5,4,1,3,0},
	{5,3,1,1,1}, /* normal cysteine */
	{7,5,1,3,0},
	{8,5,2,2,0},
	{3,2,1,1,0},
	{7,6,3,1,0},
	{11,6,1,1,0},
	{11,6,1,1,0},
	{12,6,2,1,0},
	{9,5,1,1,1},
	{9,9,1,1,0},
	{7,5,1,1,0},
	{5,3,1,2,0},
	{7,4,1,2,0},
	{10,11,2,1,0},
	{9,9,1,2,0},
	{9,5,1,1,0}
};

/*
 * Mono-isotopic masses, one per column of the aa[] composition table.
 * The values correspond to H, C, N, O and S (presumably 1H, 12C, 14N, 16O
 * and 32S - verify against a reference table before changing).
 */
static double mimass[5] =
{
	1.0078250321,
	12,
	14.0030740052,
	15.9949146221,
	31.97207069
};


#ifdef WIN32
// Rounding helper, only compiled for Windows builds (the original note says the
// round function is undefined there). It adds 0.5 and truncates toward zero, so
// the result is "round half up" for non-negative input only; negative input is
// NOT rounded to the nearest integer (e.g. -0.7 yields 0).
static int diround(double input)
{
	return (int)(input+0.5);

}// static int diround(double input)
#endif

/*
 * Returns a newly allocated copy of `input` with its last extension removed,
 * i.e. everything before the final '.' of the file-name component.
 *
 * input   : file name or path; it is only read, never modified.
 * returns : malloc'ed string the caller must free(), or NULL when `input` is
 *           NULL, when the file-name component contains no '.', or when the
 *           allocation fails.
 *
 * Only dots located after the last path separator ('/' or '\\') count, so a
 * dot inside a directory name ("run.v2/sample") is not mistaken for an
 * extension.
 */
char* get_base_name(char* input)
{
	const char* dot;
	const char* sep;
	const char* bslash;
	size_t len;
	char* retval;

	if (!input)
		return NULL;

	dot = strrchr(input, '.');
	sep = strrchr(input, '/');
	bslash = strrchr(input, '\\');

	/* keep whichever separator sits furthest to the right */
	if (bslash && (!sep || bslash > sep))
		sep = bslash;

	/* no dot at all, or the last dot belongs to a directory component */
	if (!dot || (sep && dot < sep))
		return NULL;

	len = (size_t)(dot - input);
	retval = (char*) malloc(len + 1);
	if (!retval)
		return NULL;

	memcpy(retval, input, len);
	retval[len] = '\0';

	return retval;

}// char* get_base_name(char* input)


/* Releases a partially built parameters struct together with the strings it owns. */
static void pepalign_discard_parameters(parameters* params)
{
	if (!params)
		return;

	free(params->pepxml_file);
	free(params->mzxml_file);
	free(params->score_name);
	free(params->output_file_base);
	free(params);

}/* static void pepalign_discard_parameters(parameters* params) */


/*
 * Parses a "first,second" range argument.
 * A half that is missing or empty leaves its output untouched.
 * Returns a bit mask: bit 0 set when *first was written, bit 1 when *second was.
 */
static int pepalign_parse_range(const char* value, int* first, int* second)
{
	const char* comma = strchr(value, ',');
	int seen = 0;

	/* atoi stops at the comma, so no scratch copy of the argument is needed */
	if (*value != '\0' && *value != ',') {
		*first = atoi(value);
		seen |= 1;
	}/* if */

	if (comma && comma[1] != '\0') {
		*second = atoi(comma+1);
		seen |= 2;
	}/* if */

	return seen;

}/* static int pepalign_parse_range(const char* value, int* first, int* second) */


/*
 * Builds a parameters struct from the command line.
 *
 * Recognised options (value either attached, "-e10", or in the next
 * argument, "-e 10"):
 *   -p pepXML file        -m mzXML file          -e max mass measurement error
 *   -R start,end MS scans -P start,end pep scans -l LC sigma
 *   -L min score          -U max score           -t score name
 *   -o output file base   -M mode ("time" selects time alignment)
 *
 * argv[0] is assumed to be the program name. Unknown options are ignored.
 *
 * Returns a malloc'ed struct, or NULL when an option lacks its value, an input
 * file cannot be opened, a mandatory setting (-p, -m, -e, -t) is missing, or
 * memory runs out. Everything allocated here is released before NULL is
 * returned.
 */
parameters* pepalign_filter_parameters(int argc, char *argv[])
{
	char* value;			/* value belonging to the current option */
	char flag;				/* option letter */
	int i;					/* argument index */
	int range_lo, range_hi;	/* parsed halves of a range argument */
	int seen;				/* which halves of the range were present */
	FILE* fint;				/* file tester */
	parameters* params;

	params = (parameters*) malloc(sizeof(parameters));
	if (!params) {
		printf("Error: Memory allocation for parameters failed.\n");
		return NULL;
	}// if
	init_parameters(params);

	/* Assuming first parameter is the program name */
	for (i=1; i<argc; i++) {
		if (argv[i][0] != '-' || argv[i][1] == '\0')
			continue;

		flag = argv[i][1];
		if (!strchr("pmeRPlLUtoM", flag))
			continue;	/* not one of ours */

		/* value is either glued to the flag or found in the next argument */
		if (argv[i][2] != '\0')
			value = &argv[i][2];
		else
			value = (i+1 < argc) ? argv[i+1] : NULL;

		if (!value) {
			printf("Error: Option -%c requires a value.\n", flag);
			pepalign_discard_parameters(params);
			return NULL;
		}// if

		switch (flag) {
			/* PepXML file specification plus alignment output standard */
			case 'p':
				free(params->pepxml_file);	/* drop an earlier -p, if any */
				params->pepxml_file = strclone(value);
				break;

			/* mzXML file input specification */
			case 'm':
				free(params->mzxml_file);
				params->mzxml_file = strclone(value);
				break;

			/* max mass measurement error */
			case 'e':
				params->mmme = atof(value);
				break;

			/* Scan range */
			case 'R':
				seen = pepalign_parse_range(value, &range_lo, &range_hi);
				if (seen & 1)
					params->ms_start_scan = range_lo;
				if (seen & 2)
					params->ms_end_scan = range_hi;
				break;

			/* Peptide range */
			case 'P':
				seen = pepalign_parse_range(value, &range_lo, &range_hi);
				if (seen & 1)
					params->pep_start_scan = range_lo;
				if (seen & 2)
					params->pep_end_scan = range_hi;
				break;

			/* Sigma declaration */
			case 'l':
				params->lc_sigma = atof(value);
				break;

			/* Minimum score threshold */
			case 'L':
				params->min_score_threshold = atof(value);
				break;

			/* Maximum score threshold */
			case 'U':
				params->max_score_threshold = atof(value);
				break;

			/* Name of the score the thresholds apply to */
			case 't':
				free(params->score_name);
				params->score_name = strclone(value);
				break;

			/* Output file specification */
			case 'o':
				free(params->output_file_base);
				params->output_file_base = strclone(value);
				break;

			/* Mode specifier */
			case 'M':
				if (strstr(value, "time"))
					params->alignmode = 1;
				break;

			default:
				break;
		}// switch
	}// for

	if (params->pepxml_file) {
		fint = fopen(params->pepxml_file, "r");
		if (!fint) {
			printf("Error: Specified pepXML file does not exist.\n");
			pepalign_discard_parameters(params);
			return NULL;
		}// if
		fclose(fint);
	}// if

	if (params->mzxml_file) {
		fint = fopen(params->mzxml_file, "r");
		if (!fint) {
			printf("Error: Specified mzXML file does not exist.\n");
			pepalign_discard_parameters(params);
			return NULL;
		}// if
		fclose(fint);
	}// if

	// Checking integrity of the parameter set
	if (params->pepxml_file == NULL || params->mzxml_file == NULL || params->mmme < 0 || params->score_name == NULL) {
		pepalign_discard_parameters(params);
		return NULL;
	}// if

	if (!params->output_file_base) {
		params->output_file_base = get_base_name(params->pepxml_file);

		/* a pepXML name without extension serves as its own base name */
		if (!params->output_file_base)
			params->output_file_base = strclone(params->pepxml_file);
	}// if

	return params;

}/* parameters* pepalign_filter_parameters(int argc, char *argv[]) */


/*
 * Puts a parameters struct into its "nothing specified yet" state:
 * all strings NULL, scan windows spanning the whole int range, numeric
 * settings at their sentinel values and scan-based alignment selected.
 */
void init_parameters(parameters* params)
{
	/* alignment mode: 0 unless the command line asks for "time" */
	params->alignmode = 0;

	/* negative means "not supplied" for both of these */
	params->lc_sigma = -1;
	params->mmme = -1;

	/* score window sentinels */
	/* NOTE(review): DBL_MIN is the smallest positive normalised double, not
	   the most negative value - confirm that is the intended lower bound */
	params->max_score_threshold = DBL_MAX;
	params->min_score_threshold = DBL_MIN;

	/* peptide scan window: unrestricted */
	params->pep_end_scan = INT_MAX;
	params->pep_start_scan = INT_MIN;

	/* MS scan window: unrestricted */
	params->ms_end_scan = INT_MAX;
	params->ms_start_scan = INT_MIN;

	/* file names and score name are filled in later */
	params->score_name = NULL;
	params->output_file_base = NULL;
	params->pepxml_file = NULL;
	params->mzxml_file = NULL;

}/* void init_parameters(parameters* params) */


/*
 * Collects the peptides of a parsed pepXML file that fall inside the peptide
 * scan window and pass process_peptide().
 *
 * pepfile : parsed pepXML structure to walk
 * params  : filter settings; alignmode is reset to 0 (scan mode) when a query
 *           carries a negative retention time while time mode was requested
 * pepnum  : receives the number of peptides returned
 *
 * Returns a malloc'ed array holding *pepnum entries (never NULL; at least one
 * element is allocated even when nothing matched). The sequence and protein
 * strings are fresh strclone() copies. The process exits when the initial
 * allocation fails.
 */
peptide_type* read_peptides(pmsms_pipeline_analysis pepfile, parameters* params, int* pepnum)
{
	int i, j;
	long peptide_count = 0;
	size_t capacity, kept;
	float avgscan;
	peptide_type *peptide, *shrunk;
	spectrum_query sq;
	search_hit sh;

	// Counting the total number of spectrum queries (upper bound for the result)
	for (i=0; i<pepfile->run_summary_count; i++) {
		peptide_count += pepfile->run_summary_array[i].spectrum_query_count;
	}// for

	/* peptide_count is a long, so it needs %ld */
	printf("(PepXML file contains %ld peptides)",peptide_count); fflush(stdout);

	*pepnum = 0;

	/* never request zero bytes: malloc(0) may legitimately return NULL */
	capacity = (peptide_count > 0) ? (size_t)peptide_count : 1;
	peptide = (peptide_type*) malloc(sizeof(peptide_type)*capacity);
	if (!peptide) {
		printf("Error: Memory allocation for peptides failed.\n");
		exit(-1);
	}// if

	/* Walking all spectrum results */
	for (i=0; i<pepfile->run_summary_count; i++) {
		for (j=0; j<pepfile->run_summary_array[i].spectrum_query_count; j++) {
			sq = pepfile->run_summary_array[i].spectrum_query_array[j];	/* jth query of run i */

			/* NOTE(review): assumes every query has at least one search result
			   with at least one hit - confirm against the pepXML reader */
			sh = sq.search_result_array[0].search_hit_array[0];

			if (params->alignmode && sq.retention_time_sec < 0) {
				printf("Warning: time alignment mode not allowed due to missing retention time info in the pepXML file.\nSwitching to scan mode...\n");
				params->alignmode = 0;
			}/* if */

			avgscan = (sq.start_scan + sq.end_scan) / 2.0;

			/* Checking bounds of the peptide */
			if (avgscan < params->pep_start_scan || avgscan > params->pep_end_scan)
				continue;

			/* Skipping peptides whose score does not qualify */
			if (!process_peptide(sh, params))
				continue;

			/* Found valid peptide */
			peptide[*pepnum].sequence = strclone(sh.peptide);
			peptide[*pepnum].protein = strclone(sh.protein);
			peptide[*pepnum].scan = avgscan;
			peptide[*pepnum].time = sq.retention_time_sec;
			peptide[*pepnum].score = get_relevant_scores(sh, params);

			/* m/z of the (possibly modified) peptide at the assumed charge */
			peptide[*pepnum].mz = (calculate_sequence_mass(sh.peptide, sh.modification_info_struct) + ((sq.assumed_charge)*HPLUS_MASS)) / (double)(sq.assumed_charge);
			*pepnum += 1;
		}// for
	}/* for */

	/* Shrinking to the number of peptides kept; when that fails the
	   original, larger buffer is still perfectly usable */
	kept = (*pepnum > 0) ? (size_t)(*pepnum) : 1;
	shrunk = (peptide_type*) realloc(peptide, sizeof(peptide_type)*kept);

	return shrunk ? shrunk : peptide;

}/* peptide_type* read_peptides(pmsms_pipeline_analysis pepfile, parameters* params, int* pepnum) */


/*
 * Removes duplicate sequences from peparray in place, compacting the
 * survivors to the front and storing the new length in *arraylen.
 *
 * For a duplicated sequence the first occurrence is kept, but its score, scan
 * and retention time are replaced by those of a better-scoring copy:
 *   - a maximum threshold was supplied (max_score_threshold != DBL_MAX):
 *     lower scores are better;
 *   - otherwise, when a minimum threshold was supplied
 *     (min_score_threshold != DBL_MIN): higher scores are better;
 *   - with no threshold supplied the first occurrence is kept unchanged.
 * The sequence and protein strings of discarded copies are freed.
 *
 * NOTE(review): protein and mz always stay those of the first occurrence -
 * confirm that is intended when copies differ in charge or modification.
 */
void unify_peptide_array(peptide_type* peparray, int* arraylen, parameters* params)
{
	int ci, wi;		/* slot being finalised and walker index */
	int found;		/* a later valid entry was moved into slot ci */
	int better;		/* duplicate outranks the kept entry */

	for (ci=0; ci<(*arraylen); ci++) {
		/* slot was invalidated earlier: pull the next valid entry forward */
		if (!peparray[ci].sequence) {
			found = 0;
			for (wi=ci+1; wi<(*arraylen); wi++) {
				if (peparray[wi].sequence != NULL) {
					peparray[ci] = peparray[wi];
					peparray[wi].protein = NULL;
					peparray[wi].sequence = NULL;
					found = 1;
					break;
				}/* if */
			}/* for */

			/* nothing valid is left, so ci is the final length */
			if (!found) {
				*arraylen = ci;
				return;
			}/* if */
		}/* if */

		/* folding every later copy of this sequence into slot ci */
		for (wi=ci+1; wi<(*arraylen); wi++) {
			if (!peparray[wi].sequence)
				continue;

			if (strcmp(peparray[ci].sequence, peparray[wi].sequence) != 0)
				continue;

			/* duplicate: its strings are no longer needed */
			free(peparray[wi].sequence);
			peparray[wi].sequence = NULL;
			free(peparray[wi].protein);
			peparray[wi].protein = NULL;

			/* Upper bound set means lower is better, else a lower bound means higher is better */
			if (params->max_score_threshold != DBL_MAX && peparray[wi].score < peparray[ci].score)
				better = 1;
			else if (params->min_score_threshold != DBL_MIN && peparray[wi].score > peparray[ci].score)
				better = 1;
			else
				better = 0;

			if (better) {
				peparray[ci].score = peparray[wi].score;
				peparray[ci].scan = peparray[wi].scan;
				/* keep the retention time in step with the scan it belongs to */
				peparray[ci].time = peparray[wi].time;
			}/* if */
		}/* for */
	}/* for	*/

}/* void unify_peptide_array(peptide_type* peparray, int* arraylen, parameters* params) */


/*
 * Writes the Data Summary File: one tab-separated line per peptide with the
 * MS/MS identification (protein, sequence, m/z, scan, retention time) next to
 * the best matching MS scan found for it.
 *
 * peparray     : peptides, arraylen entries
 * SIC_max_*    : per-peptide results, parallel to peparray
 * params       : supplies output_file_base; STD_DSF_EXT is appended to it
 *
 * The file is overwritten when it exists. When the name cannot be built or
 * the file cannot be opened, an error is printed and nothing is written.
 */
void gen_dsf(peptide_type* peparray, double *SIC_max_mz, double *SIC_max_int, double *SIC_max_rt, int *SIC_max_scan, int arraylen, parameters* params)
{
	char* tempname;
	size_t baselen, extlen;
	FILE* fout;
	int i;

	if (!params || !params->output_file_base) {
		printf("Error: No output file base available for the DSF report.\n");
		return;
	}// if

	/* sizing the name dynamically so long paths cannot overflow a fixed buffer */
	baselen = strlen(params->output_file_base);
	extlen = strlen(STD_DSF_EXT);
	tempname = (char*) malloc(baselen + extlen + 1);
	if (!tempname) {
		printf("Error: Memory allocation for the DSF file name failed.\n");
		return;
	}// if
	memcpy(tempname, params->output_file_base, baselen);
	memcpy(tempname + baselen, STD_DSF_EXT, extlen + 1);	/* includes the terminator */

	fout = fopen(tempname, "w");
	if (!fout) {
		printf("Error: Could not open DSF output file %s for writing.\n", tempname);
		free(tempname);
		return;
	}// if
	free(tempname);

	fprintf(fout, "#protein\tpeptide\tm/z ratio\tAvg MS/MS scan\tMS/MS rt\tMS scan\tMS rt\tTotal intensity\tPeak m/z ratio\n");
	for(i=0; i<arraylen; i++) {
		fprintf(fout, "%s\t%s\t%1.6f\t%1.1f\t%1.6f\t%i\t%1.6f\t%1.4f\t%1.6f\n", peparray[i].protein,peparray[i].sequence, peparray[i].mz, peparray[i].scan, peparray[i].time, SIC_max_scan[i], SIC_max_rt[i], SIC_max_int[i], SIC_max_mz[i]);
	}/* for	*/

	fclose(fout);

}// void gen_dsf(peptide_type* peparray, double *SIC_max_mz, double *SIC_max_int, double *SIC_max_rt, int *SIC_max_scan, int arraylen, parameters* params)


/*
 * Calculates the mass of a peptide sequence.
 *
 * sequence : residue letters; each is looked up in AMINO_ACIDS
 * mod_info : optional (may be NULL) list of modified residues. When the next
 *            unused entry's 1-based position equals the current residue, the
 *            entry's mass is used instead of the residue's composition.
 *            NOTE(review): entries are consumed in array order, so they are
 *            presumably expected sorted by position - confirm with the reader.
 *
 * Returns the sum of the residue masses plus one water (2 H, 1 O), or 0.0 for
 * a NULL sequence. Letters that are not part of AMINO_ACIDS are reported on
 * stderr and contribute nothing.
 */
double calculate_sequence_mass(char *sequence, pmodification_info mod_info) 
{
	int a, b, i, mi;		/* loop- and indexvars */
	int seqlen;
	const int element_count = (int)(sizeof(mimass) / sizeof(mimass[0]));
	const int residue_count = (int)(sizeof(aa) / sizeof(aa[0]));
	const char* hit;		/* position of the residue letter in AMINO_ACIDS */
	double result = 0.0;	/* return value */
	int fragment[sizeof(mimass) / sizeof(mimass[0])];	/* atom counter, one slot per element */

	if (!sequence)
		return 0.0;

	/* initialization */
	for (b=0; b<element_count; b++)
		fragment[b] = 0;

	/* setting the mod_info_counter */
	mi = 0;

	/* generate molecular formula for the sequence */
	seqlen = (int) strlen(sequence);
	for (i=0; i<seqlen; i++) {
		/* If the position matches the modified part, take that mass directly */
		if (mod_info && mi < mod_info->mod_aminoacid_mass_count && i+1 == mod_info->mod_aminoacid_mass_array[mi].position) {
			result += mod_info->mod_aminoacid_mass_array[mi].mass;
			mi += 1;
			continue;
		}/* if */

		hit = strchr(AMINO_ACIDS, sequence[i]);
		if (!hit) {
			fprintf(stderr, "Warning: unknown residue '%c' in sequence %s ignored in mass calculation.\n", sequence[i], sequence);
			continue;
		}/* if */

		/* row index = number of letters preceding the hit in AMINO_ACIDS */
		a = residue_count - (int) strlen(hit);
		if (a < 0 || a >= residue_count)
			continue;	/* AMINO_ACIDS and the aa table disagree in size */

		for (b=0; b<element_count; b++)
			fragment[b] = fragment[b] + aa[a][b];
	}/* for */

	fragment[0] += 2;
	fragment[3] += 1;				/* add H2O */

	/* convert the atom counts into a mass */
	for (b=0; b<element_count; b++)
		result += fragment[b] * mimass[b];

	return result;

}/* double calculate_sequence_mass(char *sequence, pmodification_info mod_info) */


/*
 * Decides whether a search hit takes part in the alignment.
 * Returns 0 when the peptide contains any character listed in ANTI_ACIDS.
 * Otherwise returns 1 when a search score whose name contains
 * params->score_name, or failing that a peptideprophet property of that name,
 * lies inside [min_score_threshold, max_score_threshold]; 0 when none does.
 */
int process_peptide(search_hit sh, parameters* params)
{
	int idx;
	int accepted = 0;
	size_t pos, seqlen;
	double* prophet_score = NULL;
	void* hook;

	/* any excluded residue disqualifies the peptide outright */
	seqlen = strlen(sh.peptide);
	for (pos=0; pos<seqlen; pos++) {
		if (strchr(ANTI_ACIDS, sh.peptide[pos]) != NULL)
			return 0;
	}/* for */

	/* regular search scores carrying the requested name */
	for (idx=0; idx<sh.search_score_count; idx++) {
		if (strstr(sh.search_score_array[idx].name, params->score_name) == NULL)
			continue;

		if (sh.search_score_array[idx].value < params->min_score_threshold)
			continue;

		if (sh.search_score_array[idx].value <= params->max_score_threshold)
			accepted = 1;
	}/* for */

	if (accepted)
		return accepted;

	/* no regular score qualified: consult the hooked peptide prophet results */
	for (idx=0; idx<sh.analysis_result_count; idx++) {
		hook = (void*) sh.analysis_result_array[idx].hook;

		if (strstr(sh.analysis_result_array[idx].analysis, "peptideprophet") && hook)
			prophet_score = (double*) peptide_prophet_result_property(params->score_name, hook);

		/* prophet_score deliberately keeps its value from earlier iterations */
		if (prophet_score && *prophet_score >= params->min_score_threshold && *prophet_score <= params->max_score_threshold)
			accepted = 1;
	}/* for */

	return accepted;

}/* int process_peptide(search_hit sh, parameters* params) */


/*
 * Fetches the score of a search hit that the alignment works with.
 * The first search score whose name contains params->score_name wins and is
 * returned as is. Otherwise the first peptideprophet property of that name
 * lying inside [min_score_threshold, max_score_threshold] is returned.
 * DBL_MIN is returned when neither source yields a value.
 */
double get_relevant_scores(search_hit sh, parameters* params)
{
	int idx;
	double* prophet_score = NULL;
	void* hook;

	/* regular search scores first */
	for (idx=0; idx<sh.search_score_count; idx++) {
		if (strstr(sh.search_score_array[idx].name, params->score_name) != NULL)
			return sh.search_score_array[idx].value;
	}/* for */

	/* then the hooked results from peptide prophet */
	for (idx=0; idx<sh.analysis_result_count; idx++) {
		hook = (void*) sh.analysis_result_array[idx].hook;

		if (hook && strstr(sh.analysis_result_array[idx].analysis, "peptideprophet"))
			prophet_score = (double*) peptide_prophet_result_property(params->score_name, hook);

		if (!prophet_score)
			continue;

		if (*prophet_score >= params->min_score_threshold && *prophet_score <= params->max_score_threshold)
			return *prophet_score;
	}/* for */

	return DBL_MIN;

}/* double get_relevant_scores(search_hit sh, parameters* params) */


/*
 * Sums the intensity of all peaks of one scan that lie within the mass
 * tolerance of a peptide.
 *
 * peaks          : peaks of the scan (count, mzs, intensities); the binary
 *                  search relies on mzs being in ascending order
 * pep            : peptide whose mz is looked for
 * mmme           : tolerance in ppm; a peak matches when its relative
 *                  deviation from pep.mz is strictly below this value
 * best_intensity : receives the intensity of the strongest matching peak (0 if none)
 * best_mz        : receives the m/z of that peak (0 if none)
 *
 * Returns the summed intensity of the contiguous run of matching peaks around
 * the hit, or 0 when no peak matches. Every peak, including the one the
 * search lands on, is counted exactly once.
 */
double match_pep_spectrum(scan_peaks peaks, peptide_type pep, double mmme, double* best_intensity, double* best_mz)
{
	int k;
	int found = 0, lbnd, ubnd, pos = 0;
	double intensity_sum = 0.0;
	double ppmdiff;

	// Initializing return parameters
	*best_intensity = 0;
	*best_mz = 0;

	// Binary search for any peak inside the tolerance window
	lbnd = 0;
	ubnd = peaks.count;
	while (!found && lbnd < ubnd) {
		pos = lbnd + ((ubnd - lbnd) / 2);

		ppmdiff = ((fabs(peaks.mzs[pos] - pep.mz))/pep.mz)*1e6;
		if (ppmdiff < mmme) {
			/* the sum is built by the walks below, so the hit is not added here */
			*best_mz = peaks.mzs[pos];
			*best_intensity = peaks.intensities[pos];
			found = 1;
		}// if
		else if (peaks.mzs[pos] > pep.mz) {
			ubnd = pos;
		}// else if
		else {
			if (pos == lbnd)
				break;	/* interval cannot shrink any further */
			lbnd = pos;
		}// else
	}// while

	if (!found)
		return intensity_sum;

	// Walking down from the hit (hit included) while peaks keep matching
	for (k=pos; k>=0; k--) {
		ppmdiff = ((fabs(peaks.mzs[k] - pep.mz))/pep.mz)*1e6;
		if (!(ppmdiff < mmme))
			break;

		intensity_sum += peaks.intensities[k];
		if (peaks.intensities[k] > *best_intensity) {
			*best_mz = peaks.mzs[k];
			*best_intensity = peaks.intensities[k];
		}// if
	}// for

	// Walking up from the peak after the hit
	for (k=pos+1; k<peaks.count; k++) {
		ppmdiff = ((fabs(peaks.mzs[k] - pep.mz))/pep.mz)*1e6;
		if (!(ppmdiff < mmme))
			break;

		intensity_sum += peaks.intensities[k];
		if (peaks.intensities[k] > *best_intensity) {
			*best_mz = peaks.mzs[k];
			*best_intensity = peaks.intensities[k];
		}// if
	}// for

	return intensity_sum;

}// double match_pep_spectrum(scan_peaks peaks, peptide_type pep, double mmme, double* best_intensity, double* best_mz)


/* Allocates at least one byte and terminates the program when memory runs out. */
static void* align_checked_alloc(size_t bytes)
{
	void* block = malloc(bytes ? bytes : 1);

	if (!block) {
		printf("Error: Memory allocation for alignment failed.\n");
		exit(-1);
	}// if

	return block;

}// static void* align_checked_alloc(size_t bytes)


/*
 * Aligns the identified peptides with the MS level 1 scans of an mzXML file.
 *
 * For every peptide the MS1 scan (within ms_start_scan..ms_end_scan) with the
 * highest summed matching intensity is located, the Data Summary File is
 * written, and the matched (MS/MS, MS) pairs - scan numbers, or retention
 * times when params->alignmode is 1 - are sorted by their MS/MS coordinate
 * and handed to run_ga_alignment().
 *
 * mzxmlf    : opened mzXML file; scan i is read from scan_array[i-1]
 * peparray  : peptides to align, arraylen entries
 * params    : settings; lc_sigma is computed and stored when negative
 * nofpoints : receives the number of breakpoints returned
 *
 * Returns a malloc'ed array of *nofpoints rows, each a malloc'ed {x, y} pair;
 * the caller owns the rows and the array. The process exits on allocation
 * failure.
 *
 * NOTE(review): the scan window is not checked against the number of scans in
 * the file here - presumably the caller clamps it; verify.
 */
double** align_peptides(pmzxml_file mzxmlf, peptide_type* peparray, int arraylen, parameters* params, int* nofpoints)
{
	int i, j, k, step;
	int num_matched, point_count;
	double intensity_sum, scan_intensity_max, scan_mz_max, xmax, ymax;
	double *SIC_max_mz, *SIC_max_int, *SIC_max_rt;
	double *row;
	int *SIC_max_scan;
	double** data_points;
	double** best_points;
	size_t slots;
	scan_peaks peaks;
	pplf_type plf_best;

	// Creating and initializing memory structures for each peptide
	slots = (arraylen > 0) ? (size_t)arraylen : 0;
	SIC_max_int = (double*) align_checked_alloc(slots*sizeof(double));
	SIC_max_mz = (double*) align_checked_alloc(slots*sizeof(double));
	SIC_max_rt = (double*) align_checked_alloc(slots*sizeof(double));
	SIC_max_scan = (int*) align_checked_alloc(slots*sizeof(int));
	for (i=0; i<arraylen; i++) {
		SIC_max_int[i] = 0.0;
		SIC_max_mz[i] = 0.0;
		SIC_max_rt[i] = 0.0;
		SIC_max_scan[i] = -1;
	}// for

	/* Progress is reported every tenth of the range. The width is computed in
	   double so a wide window cannot overflow int, and step is kept at least 1
	   so that i%step is always defined */
	k = 0;
	step = (int)((((double)params->ms_end_scan - (double)params->ms_start_scan) + 1.0) / 10.0);
	if (step < 1)
		step = 1;
	printf("Loading predefined scan range...");

	// Going through all the MS lvl 1 scans
	for (i=params->ms_start_scan; i<=params->ms_end_scan; i++) {
		if (mzxmlf->scan_array[i-1]->attributes.msLvl != 1)
			continue;

		if (i%step == 0) {
			printf("%i%%...", k*10);
			k += 1;
		}// if

		if (mzxmlf->scan_array[i-1]->attributes.peakscount <= 0)
			continue; /* skip spectrum */

		/* Loading the peaks */
		peaks = load_scan_peaks(mzxmlf, i);

		// Looping over all the unique peptides
		for (j=0; j<arraylen; j++) {
			intensity_sum = match_pep_spectrum(peaks, peparray[j], params->mmme, &scan_intensity_max, &scan_mz_max);

			if (intensity_sum > SIC_max_int[j]) {
				SIC_max_int[j] = intensity_sum;
				SIC_max_mz[j] = scan_mz_max;
				SIC_max_scan[j] = i;
				SIC_max_rt[j] = mzxmlf->scan_array[i-1]->attributes.retentionTime;
			}// if
		}// for

		unload_scan_peaks(mzxmlf, i);
	}// for
	printf("done.");

	// Generating the Data Summary File
	gen_dsf(peparray, SIC_max_mz, SIC_max_int, SIC_max_rt, SIC_max_scan, arraylen, params);

	// Counting the peptides that were matched to a scan
	num_matched = 0;
	for (i=0; i<arraylen; i++) {
		if (SIC_max_scan[i] >= 0)
			num_matched++;
	}// for

	// Filling the data-point array: [0] = MS/MS coordinate, [1] = MS coordinate
	data_points = (double**) align_checked_alloc((size_t)num_matched*sizeof(double*));
	j = 0;
	for (i=0; i<arraylen; i++) {
		if (SIC_max_scan[i] < 0)
			continue;

		data_points[j] = (double*) align_checked_alloc(2*sizeof(double));
		if (params->alignmode == 1) {
			data_points[j][1] = SIC_max_rt[i];
			data_points[j][0] = peparray[i].time;
		}
		else {
			data_points[j][1] = SIC_max_scan[i];
			data_points[j][0] = peparray[i].scan;
		}// else
		j++;
	}// for

	// Sorting by MS/MS coordinate (stable bubble sort, swapping whole rows)
	for (i=0; i<num_matched; i++) {
		for (j=0; j<num_matched-1-i; j++) {
			if (data_points[j+1][0] < data_points[j][0]) {
				row = data_points[j];
				data_points[j] = data_points[j+1];
				data_points[j+1] = row;
			}// if
		}// for
	}// for

	// Freeing data unused
	free(SIC_max_int);
	free(SIC_max_mz);
	free(SIC_max_scan);
	free(SIC_max_rt);

	// Getting the sigma if necessary, and the alignment
	if (params->lc_sigma < 0)
		params->lc_sigma = calc_lcsigma(data_points, num_matched);
	plf_best = run_ga_alignment(data_points, num_matched, params->lc_sigma, params->output_file_base, &xmax, &ymax);

	/* Copying exactly as many breakpoints as are allocated and reported. The
	   count is capped at MAX_BREAKPOINTS, the number of entries the previous
	   code read from xpoints/ypoints */
	point_count = plf_best->nof_points;
	if (point_count > MAX_BREAKPOINTS)
		point_count = MAX_BREAKPOINTS;
	if (point_count < 0)
		point_count = 0;
	*nofpoints = point_count;

	best_points = (double**) align_checked_alloc((size_t)point_count*sizeof(double*));
	for (i=0; i<point_count; i++) {
		best_points[i] = (double*) align_checked_alloc(2*sizeof(double));
		best_points[i][0] = plf_best->xpoints[i];
		best_points[i][1] = plf_best->ypoints[i];
	}// for

	/* releasing the rows as well as the row table */
	for (i=0; i<num_matched; i++)
		free(data_points[i]);
	free(data_points);
	free(plf_best);

	return best_points;

}// double** align_peptides(pmzxml_file mzxmlf, peptide_type* peparray, int arraylen, parameters* params, int* nofpoints)


/* Calculates a conservative sigma value */
double calc_lcsigma(double** data_points, int arraylen)
{
	double* scan_diff;
	double swap, scan_diff_median, sum_squared_deviation;
	int non_matched = 0;
	int i, j;

	scan_diff = (double*) malloc(arraylen * sizeof(double));

	for(i=0; i<arraylen-1; i++) {
		if((data_points[i][1]>-1) && (data_points[i+1][1]>-1)) 
			scan_diff[i] = data_points[i+1][1] - data_points[i][1]; 
		else {
			scan_diff[i] = 99999999; 
			non_matched += 1;
		}// else
	}// for
	  
	for(i=0; i<arraylen-1; i++) {
		for(j=0; j<arraylen-1-i; j++) {	      
			if(scan_diff[j+1] > scan_diff[j]) {  
				swap = scan_diff[j];       
				scan_diff[j] = scan_diff[j+1];
				scan_diff[j+1] = swap;
			}// if	      
		}// for	  
	}// for

	// Calculating Sigma
	scan_diff_median = scan_diff[(int)((arraylen+non_matched)/2)];
	sum_squared_deviation = 0;
	for(i=non_matched+diround((arraylen-non_matched)*0.2); i<arraylen-1-diround((arraylen-non_matched)*0.2); i++) {
		if(scan_diff[i]<99999998) {
			sum_squared_deviation+=(scan_diff[i]-scan_diff_median)*(scan_diff[i]-scan_diff_median); 
		}// if
	}// for
      
	return sqrt(sum_squared_deviation/(arraylen-2-non_matched));
}
