#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>

#include "subread.h"
#include "HelperFunctions.h"
#include "seek-zlib.h"
#include "gene-algorithms.h"
#include "input-blc.h"


// Scratch state filled in by iBLC_guess_scan() while it walks a run directory.
// Callers zero the whole structure before the first scan.
struct iBLC_scan_t{
	// printf-style path template of the BCL files: the "/L001" part of the directory
	// becomes "/L%03d" and the "0001.bcl." part of the file name becomes "%04d.bcl.".
	char out_format_string[MAX_FILE_NAME_LENGTH];
	// printf-style path template of the filter files: "/L001" becomes "/L%03d" and
	// "s_1.filter" becomes "s_%d.filter".
	char filter_format_string[MAX_FILE_NAME_LENGTH];
	// set to 1 once a single directory yielded both a BCL template and a filter file.
	int found_answer;
	// value returned by autozip_open() for the filter file.
	int filter_is_gzipped;
	// value returned by autozip_open() for the probed BCL file.
	int bcl_is_gzipped;
	// highest "Read Number" seen in RunInfo.xml; 0 while no RunInfo.xml was parsed.
	int reads_per_cluster;
	// per read (index = Read Number - 1): the third quoted attribute value of the <Read> element.
	int read_lengths[INPUT_BLC_MAX_READS];
	// per read (index = Read Number - 1): 1 when the fifth quoted attribute value starts with 'Y'.
	int read_is_index[INPUT_BLC_MAX_READS];
};

int iBLC_guess_scan(struct iBLC_scan_t * scancon, char * data_dir ){
	DIR * this_level = opendir(data_dir);
	struct dirent *dp;
	int filter_found = 0, bcl_found = 0;
	char testfile_name[MAX_FILE_NAME_LENGTH];
	while ((dp = readdir (this_level)) != NULL) {
		if(dp -> d_type == DT_DIR && dp->d_name[0]!='.'){
			strcpy(testfile_name,data_dir);
			strcat(testfile_name, "/");
			strcat(testfile_name, dp->d_name);
			//SUBREADprintf("DIG: %s\n", testfile_name);
			if(iBLC_guess_scan( scancon, testfile_name))return -1;
		}else if(dp -> d_type == DT_REG){
			//SUBREADprintf( "%s  %s  %p  %p\n" , data_dir, dp->d_name , strstr( dp->d_name , "0001.bcl." ) , strstr( dp->d_name , ".bci") );
			if(0==strcmp(dp->d_name, "RunInfo.xml")){
				if(scancon->reads_per_cluster > 0){
					SUBREADprintf("ERROR: the root directory contains multiple scRNA data sets.\n");
					return -1;
				}

				strcpy(testfile_name, data_dir);    
				strcat(testfile_name, "/");
				strcat(testfile_name, dp->d_name);
				FILE *fp = fopen(testfile_name,"r");
				if(NULL == fp){
					SUBREADprintf("ERROR: cannot open the run info file: %s\n", testfile_name);
				}
				while(1){
					char inbuf[MAX_READ_LENGTH];
					if(!fgets( inbuf, MAX_READ_LENGTH-1, fp))break;
					if(strstr( inbuf, "<Read Number=\"" )){
						char * rbuf=NULL;
						int my_index = -1, is_idx = -1, rlen = -1, ii=0;
						strtok_r(inbuf, "\"", &rbuf);

						while(rbuf){
							char * sec = strtok_r(NULL, "\"", &rbuf);
							if(!sec) break;
							//printf("SEC %d : %s\n", ii, sec);
							if(ii == 0) my_index = atoi(sec);
							if(ii == 2) rlen = atoi(sec);
							if(ii == 4) is_idx = sec[0]=='Y';
							ii++;
						}
						assert(INPUT_BLC_MAX_READS>my_index);
						if(my_index >0 && is_idx >=0 && rlen>0){
							scancon -> read_lengths[my_index-1]=rlen;
							scancon -> read_is_index[my_index-1]=is_idx;
							scancon -> reads_per_cluster = max(scancon -> reads_per_cluster, my_index);
						}else assert( my_index >0 && is_idx >=0 && rlen>0 );
					}
				}
				fclose(fp);
				if(scancon -> reads_per_cluster <1){
					SUBREADprintf("ERROR: the format of RunInfo.xml is unknown\n");
					return -1;
				}
			}
			if(0==memcmp(data_dir+ strlen(data_dir)-5, "/L001",5 ) && strstr( dp->d_name , "s_1.filter")){
				autozip_fp tfp;
				strcpy(testfile_name, data_dir);    
				strcat(testfile_name, "/");
				strcat(testfile_name, dp->d_name);
				int resop = autozip_open(testfile_name, &tfp);
				if(0 <= resop){
					autozip_close(&tfp);
					char * gen_fmt = str_replace(dp->d_name , "s_1.filter", "s_%d.filter");
					char * gen_fmt2 = str_replace(data_dir , "/L001", "/L%03d");
					strcpy(scancon -> filter_format_string, gen_fmt2);
					strcat(scancon -> filter_format_string, "/");
					strcat(scancon -> filter_format_string, gen_fmt);
					free(gen_fmt2);
					free(gen_fmt);
					filter_found = resop + 1;
				}
			}
			if(0==memcmp(data_dir+ strlen(data_dir)-5, "/L001",5 ) && strstr( dp->d_name , "0001.bcl." ) && !strstr( dp->d_name , ".bci") ){
				int tti;
				bcl_found = 1;
				char * gen_fmt = str_replace(dp->d_name , "0001.bcl.", "%04d.bcl.");
				
				for(tti = 0; tti<22; tti++){
					strcpy(testfile_name, data_dir);	
					strcat(testfile_name, "/");
					sprintf(testfile_name+strlen(testfile_name), gen_fmt, 1, 2+tti);
					autozip_fp tfp;
					int resop = autozip_open(testfile_name, &tfp);
					//printf("%d === %s    %s\n", resop, gen_fmt, testfile_name);
					if(0<=resop){
						scancon -> bcl_is_gzipped = resop;
						autozip_close(&tfp);
					}else bcl_found=0;
				}
				if(bcl_found){
					char * gen_fmt2 = str_replace(data_dir , "/L001", "/L%03d");
					strcpy(scancon -> out_format_string, gen_fmt2);	
					free(gen_fmt2);
					strcat(scancon -> out_format_string, "/");
					strcat(scancon -> out_format_string, gen_fmt);
				}

				free(gen_fmt);
			}
		}
	}

	if(bcl_found && filter_found){
		scancon -> found_answer = 1;
		scancon -> filter_is_gzipped = filter_found - 1;
	}

	closedir(this_level);
	return 0;
}

// Scans data_dir and reports how its BCL data set is laid out.
//   cluster_bases     : receives the sum of all read lengths.
//   format_string     : receives the printf-style BCL path template.
//   filter_format     : receives the printf-style filter path template.
//   bcl_is_gzipped    : receives the autozip_open() result of the BCL probe.
//   filter_is_gzipped : receives the autozip_open() result of the filter file.
//   read_lens         : receives one length per read, followed by a 0 terminator.
//   is_index          : receives one index flag per read.
// Returns 0 on success. Returns -1, without touching any output, when the scan
// failed, no BCL/filter pair was found, no read layout is known (no RunInfo.xml)
// or a read has no length.
int iBLC_guess_format_string(char * data_dir, int * cluster_bases, char * format_string, char * filter_format, int * bcl_is_gzipped, int * filter_is_gzipped, int * read_lens, int * is_index){
	struct iBLC_scan_t sct;
	int tii;

	memset(&sct, 0, sizeof(sct));
	if(iBLC_guess_scan(&sct, data_dir) || ! sct.found_answer) return -1;

	// Without a read layout there would be zero bases per cluster and nothing to read.
	if(sct.reads_per_cluster < 1) return -1;
	for(tii=0; tii<sct.reads_per_cluster; tii++)
		if(sct.read_lengths[tii]<1) return -1;

	strcpy(format_string, sct.out_format_string);
	strcpy(filter_format, sct.filter_format_string);
	*filter_is_gzipped = sct.filter_is_gzipped;
	*bcl_is_gzipped = sct.bcl_is_gzipped;
	*cluster_bases=0;

	for(tii=0; tii<sct.reads_per_cluster; tii++){
		read_lens[tii] = sct.read_lengths[tii];
		is_index[tii] = sct.read_is_index[tii];
		(*cluster_bases) += sct.read_lengths[tii];
	}
	read_lens[sct.reads_per_cluster]=0;	// terminator after the last read length

	return 0;
}

// Closes every BCL handle and the filter handle of the current lane.
// Handles that are already NULL (never opened, or closed by an earlier call)
// are skipped, so the function is safe after a failed iBLC_open_batch() and
// when called more than once. The handle array itself is kept for reuse.
void iBLC_close_batch(input_BLC_t * blc_input){
	int ii;
	// No handle array yet: nothing has ever been opened.
	if(NULL == blc_input -> bcl_gzip_fps && blc_input -> bcl_is_gzipped)return;
	if(NULL == blc_input -> bcl_fps && !blc_input -> bcl_is_gzipped)return;

	for(ii=0; ii < blc_input->total_bases_in_each_cluster; ii++){
		if(blc_input -> bcl_is_gzipped){
			if(NULL == blc_input -> bcl_gzip_fps[ii]) continue;
			seekgz_close(blc_input -> bcl_gzip_fps[ii]);
			free(blc_input -> bcl_gzip_fps[ii]);
			blc_input -> bcl_gzip_fps[ii] = NULL;
		}else{
			if(NULL == blc_input -> bcl_fps[ii]) continue;
			fclose(blc_input -> bcl_fps[ii]);
			blc_input -> bcl_fps[ii] = NULL;
		}
	}

	if(blc_input -> filter_is_gzipped){
		if(blc_input -> filter_gzip_fp){
			seekgz_close(blc_input -> filter_gzip_fp);
			free(blc_input -> filter_gzip_fp);
			blc_input -> filter_gzip_fp = NULL;
		}
	}else if(blc_input -> filter_fp){
		fclose(blc_input -> filter_fp);
		blc_input -> filter_fp = NULL;
	}
}

// Opens all BCL files and the filter file of blc_input->current_lane, after
// closing whatever was open before. The first 4 bytes of every BCL file and
// the first 12 bytes of the filter file are skipped.
// Returns 0 on success, -1 when a file name does not fit, memory runs out or a
// file cannot be opened. Handles opened before the failure stay open.
int iBLC_open_batch(input_BLC_t * blc_input ){
	char fname[MAX_FILE_NAME_LENGTH];
	int fii, xx, flen;

	iBLC_close_batch(blc_input);

	if(blc_input -> bcl_gzip_fps == NULL){
		blc_input -> bcl_gzip_fps = calloc( blc_input -> total_bases_in_each_cluster, sizeof(void *) ); // for both FILE** and seekgz **
		if(blc_input -> bcl_gzip_fps == NULL && blc_input -> total_bases_in_each_cluster > 0){
			SUBREADprintf("ERROR: Out of memory\n");
			return -1;
		}
	}

	for(fii = 0; fii < blc_input -> total_bases_in_each_cluster; fii++){
		flen = snprintf(fname, sizeof(fname), blc_input -> bcl_format_string, blc_input -> current_lane, fii+1);
		if(flen < 0 || (size_t)flen >= sizeof(fname)){
			SUBREADprintf("ERROR: the file name is too long: %s\n", blc_input -> bcl_format_string);
			return -1;
		}
		printf("OPEN_BCL %s\n", fname);
		if(blc_input -> bcl_is_gzipped){
			blc_input -> bcl_gzip_fps[fii] = calloc( 1, sizeof(seekable_zfile_t));
			if(NULL == blc_input -> bcl_gzip_fps[fii]){
				SUBREADprintf("ERROR: Out of memory\n");
				return -1;
			}
			int rv = seekgz_open(fname, blc_input -> bcl_gzip_fps[fii], NULL);
			if(rv){
				SUBREADprintf("ERROR: Unable to open %s\n", fname);
				return -1;
			}
			for(xx = 0; xx < 4; xx++) seekgz_next_int8(blc_input -> bcl_gzip_fps[fii]); // skip the first 32-b integer
		}else{
			// BCL files are binary: "rb" keeps the bytes untouched on every platform.
			blc_input -> bcl_fps[fii] = fopen(fname, "rb");
			if(NULL == blc_input -> bcl_fps[fii]){
				SUBREADprintf("ERROR: Unable to open %s\n", fname);
				return -1;
			}
			for(xx = 0; xx < 4; xx++) fgetc(blc_input -> bcl_fps[fii]); // skip the first 32-b integer
		}
	}

	flen = snprintf(fname, sizeof(fname), blc_input -> filter_format_string, blc_input -> current_lane,blc_input -> current_lane);
	if(flen < 0 || (size_t)flen >= sizeof(fname)){
		SUBREADprintf("ERROR: the file name is too long: %s\n", blc_input -> filter_format_string);
		return -1;
	}
	printf("OPEN_FLTR %s\n", fname);
	if(blc_input -> filter_is_gzipped){
		blc_input -> filter_gzip_fp = calloc( 1, sizeof(seekable_zfile_t));
		if(NULL == blc_input -> filter_gzip_fp){
			SUBREADprintf("ERROR: Out of memory\n");
			return -1;
		}
		int rv = seekgz_open(fname, blc_input -> filter_gzip_fp, NULL);
		if(rv){
			SUBREADprintf("ERROR: Unable to open %s\n", fname);
			return -1;
		}
		for(xx = 0; xx < 12; xx++) seekgz_next_int8(blc_input -> filter_gzip_fp); // skip the 12-byte header
	}else{
		blc_input -> filter_fp = fopen(fname, "rb");
		if(NULL == blc_input -> filter_fp){
			SUBREADprintf("ERROR: Unable to open %s\n", fname);
			return -1;
		}
		for(xx = 0; xx < 12; xx++) fgetc(blc_input -> filter_fp); // skip the 12-byte header
	}
	return 0;
}

// Prepares blc_input for reading the BCL data set found under data_dir:
// clears the structure, creates its lock, detects the file layout and opens lane 1.
// Returns 0 on success, -1 when the layout cannot be detected and 1 when the
// files of lane 1 cannot be opened.
int input_BLC_init( input_BLC_t * blc_input , char * data_dir ){
	memset(blc_input, 0, sizeof(input_BLC_t));
	subread_init_lock(&blc_input -> read_lock);

	if(iBLC_guess_format_string(data_dir,
			&blc_input -> total_bases_in_each_cluster,
			blc_input -> bcl_format_string,
			blc_input -> filter_format_string,
			&blc_input -> bcl_is_gzipped,
			&blc_input -> filter_is_gzipped,
			blc_input -> single_read_lengths,
			blc_input -> single_read_is_index))
		return -1;

	blc_input -> current_lane = 1;

	if(iBLC_open_batch(blc_input)) return 1;
	return 0;
}

// Loads the next read that passed the filter, without switching lane.
// Clusters whose filter byte is 0 are consumed and skipped.
//
// readname receives "R<11-digit read number>:" followed by four '|'-separated
// fields: bases of read 1, qualities of read 1, bases of read 2, qualities of
// read 2. The bases and qualities of all remaining cycles go to read and qual;
// these two buffers are NOT NUL-terminated, the caller must use the returned length.
//
// Returns the number of bases stored in read/qual, 0 at the end of the lane (or
// when the lane has no open filter stream), and -1 when a BCL file ends before
// the filter file or the filter file holds a byte other than 0 or 1.
int iBLC_current_lane_next_read(input_BLC_t * blc_input, char * readname , char * read, char * qual){
	int bii, idx_offset, base_offset;

	// A failed lane switch leaves no filter stream behind; report it as end of data
	// instead of reading from a NULL handle.
	if(blc_input -> filter_is_gzipped ? (NULL == blc_input -> filter_gzip_fp) : (NULL == blc_input -> filter_fp)) return 0;

	sprintf(readname, "R%011llu:", blc_input -> read_number +1);

	idx_offset = blc_input -> single_read_lengths[0];
	base_offset = idx_offset + blc_input -> single_read_lengths[1];

	// Field separators and terminator; the 13-character prefix is "R", 11 digits and ':'.
	readname[13+idx_offset]='|';
	readname[14+2*idx_offset]='|';
	readname[15+base_offset+idx_offset]='|';
	readname[16+2*base_offset]=0;

	while(1){
		int fch = blc_input -> filter_is_gzipped? seekgz_next_int8(blc_input -> filter_gzip_fp) :fgetc(blc_input -> filter_fp);
		if(fch < 0) return 0;
		if(fch != 0 && fch != 1){
			SUBREADprintf("ERROR: unexpected value in the filter file\n");
			return -1;
		}

		int baseii =0;
		for(bii =0; bii< blc_input -> total_bases_in_each_cluster; bii++){
			int nch, bv, qv;
			if(blc_input -> bcl_is_gzipped) nch = seekgz_next_int8(blc_input -> bcl_gzip_fps[bii]);
			else nch = fgetc(blc_input -> bcl_fps[bii]);

			// A negative value means that this BCL file is shorter than the filter file.
			if(nch < 0 || nch > 255){
				SUBREADprintf("ERROR: unexpected end of a BCL file\n");
				return -1;
			}

			if(0==nch){
				bv='N';
				qv='#';
			}else{
				bv="ACGT"[nch%4];	// low two bits: base
				qv=33+(nch>>2);		// remaining bits: quality, stored with offset 33
			}

			if(bii < idx_offset){
				readname[13+ bii]=bv;
				readname[14+ bii+idx_offset]=qv;
			}else if(bii < base_offset){
				readname[15+ bii+ idx_offset]=bv;
				readname[16+ bii+ base_offset]=qv;
			}else{
				read[baseii] = bv;
				qual[baseii] = qv;
				baseii++;
			}
		}

		if(fch==1){
			blc_input -> read_number ++;
			return baseii;
		}
	}
}

// Moves on to the next lane and opens its files.
// Returns whatever iBLC_open_batch() returns for the new lane.
int iBLC_inc_lane(input_BLC_t * blc_input){
	blc_input -> current_lane += 1;
	// iBLC_open_batch() closes the BCL and filter handles of the previous lane first.
	int open_result = iBLC_open_batch(blc_input);
	return open_result;
}

// Loads the next read while holding the read lock. When the current lane is
// exhausted, the next lane is opened and tried once.
// Returns -1 on error, 0 at the end of all files and lanes (the next lane cannot
// be opened, or it yields no read), >0 when a read was loaded (its length).
// The read name is the combination of the short-end and the index-end.
// NOTE: this only works with scRNA protocol!!
int input_BLC_next_read(input_BLC_t * blc_input , char * readname, char * read, char * qual){
	int result;

	subread_lock_occupy(&blc_input -> read_lock);

	result = iBLC_current_lane_next_read(blc_input, readname, read, qual);
	if(0 == result){
		// End of this lane: a lane that cannot be opened means that the data is finished.
		int lane_failed = iBLC_inc_lane(blc_input);
		if(!lane_failed)
			result = iBLC_current_lane_next_read(blc_input, readname, read, qual);
	}

	subread_lock_release(&blc_input -> read_lock);
	return result;
}

// Records the current reading position (lane, read number and the offset of
// every BCL file and of the filter file) in pos.
// pos owns the memory allocated here; release it with input_BLC_destroy_pos().
// Returns 0 on success. On allocation failure everything allocated so far is
// released, pos is zeroed and -1 is returned.
int input_BLC_tell ( input_BLC_t * blc_input , input_BLC_pos_t * pos ){
	int xx1;
	int n_files = blc_input -> total_bases_in_each_cluster;

	memset(pos,0, sizeof(*pos));
	pos -> lane_id = blc_input -> current_lane;
	pos -> read_number = blc_input -> read_number;

	if(blc_input->bcl_is_gzipped){
		pos -> pos_of_bclgzs = calloc(n_files, sizeof(*pos -> pos_of_bclgzs));
		if(NULL == pos -> pos_of_bclgzs && n_files > 0) goto out_of_memory;
		for(xx1=0; xx1<n_files; xx1++){
			pos -> pos_of_bclgzs[xx1] = malloc(sizeof(seekable_position_t));
			if(NULL == pos -> pos_of_bclgzs[xx1]) goto out_of_memory;
			seekgz_tell(blc_input->bcl_gzip_fps[xx1], pos -> pos_of_bclgzs[xx1]);
		}
	}else{
		pos -> pos_of_bcls = calloc(n_files, sizeof(*pos -> pos_of_bcls));
		if(NULL == pos -> pos_of_bcls && n_files > 0) goto out_of_memory;
		for(xx1=0; xx1<n_files; xx1++)
			pos -> pos_of_bcls[xx1] = ftello(blc_input->bcl_fps[xx1]);
	}

	if(blc_input->filter_is_gzipped){
		pos -> pos_of_filtergz = malloc(sizeof(seekable_position_t));
		if(NULL == pos -> pos_of_filtergz) goto out_of_memory;
		seekgz_tell(blc_input->filter_gzip_fp, pos -> pos_of_filtergz);
	}else pos -> pos_of_filter = ftello(blc_input->filter_fp);

	return 0;

out_of_memory:
	SUBREADprintf("ERROR: Out of memory\n");
	if(blc_input->bcl_is_gzipped){
		if(pos -> pos_of_bclgzs)
			for(xx1=0; xx1<n_files; xx1++) free(pos -> pos_of_bclgzs[xx1]);	// unset slots are NULL (calloc)
		free(pos -> pos_of_bclgzs);
	}else free(pos -> pos_of_bcls);
	memset(pos,0, sizeof(*pos));
	return -1;
}

// Moves blc_input back to a position recorded by input_BLC_tell().
// When the position belongs to another lane, the files of that lane are opened first.
// Returns 0 on success, -1 when the lane cannot be opened or a plain-file seek fails.
int input_BLC_seek ( input_BLC_t * blc_input , input_BLC_pos_t * pos ){
	int xx1;
	blc_input -> read_number = pos -> read_number;
	if(pos -> lane_id != blc_input -> current_lane){
		blc_input -> current_lane = pos -> lane_id;
		// Without open files there is nothing to seek in.
		if(iBLC_open_batch(blc_input)) return -1;
	}

	for(xx1=0; xx1<blc_input -> total_bases_in_each_cluster; xx1++){
		if(blc_input->bcl_is_gzipped){
			seekgz_seek(blc_input->bcl_gzip_fps[xx1], pos -> pos_of_bclgzs[xx1]);
		}else if(fseeko(blc_input->bcl_fps[xx1], pos -> pos_of_bcls[xx1], SEEK_SET)){
			return -1;
		}
	}

	if(blc_input->filter_is_gzipped){
		seekgz_seek(blc_input->filter_gzip_fp, pos -> pos_of_filtergz);
	}else if(fseeko(blc_input->filter_fp, pos -> pos_of_filter, SEEK_SET)){
		return -1;
	}

	return 0;
}

// Releases the memory that input_BLC_tell() attached to pos: the per-file BCL
// positions and, for a gzipped filter file, the filter position.
// blc_input must be the reader the position was taken from, because its
// "is gzipped" flags decide which members of pos hold allocations.
// The released pointers are reset to NULL.
void input_BLC_destroy_pos(input_BLC_t * blc_input , input_BLC_pos_t *pos){
	int xx1;
	if(blc_input->bcl_is_gzipped){
		if(pos -> pos_of_bclgzs)
			for(xx1=0; xx1<blc_input -> total_bases_in_each_cluster; xx1++)
				free(pos -> pos_of_bclgzs[xx1]);
		free(pos -> pos_of_bclgzs);
		pos -> pos_of_bclgzs = NULL;
	}else{
		free(pos -> pos_of_bcls);
		pos -> pos_of_bcls = NULL;
	}

	// The filter position is heap-allocated only for a gzipped filter file.
	if(blc_input->filter_is_gzipped){
		free(pos -> pos_of_filtergz);
		pos -> pos_of_filtergz = NULL;
	}
}

// Shuts the reader down: closes the files of the current lane, releases the
// handle array allocated by iBLC_open_batch() and destroys the read lock.
void input_BLC_close(input_BLC_t * blc_input){
	iBLC_close_batch(blc_input);
	// iBLC_open_batch() stores the array through bcl_gzip_fps for both file kinds.
	free(blc_input -> bcl_gzip_fps);
	blc_input -> bcl_gzip_fps = NULL;
	subread_destroy_lock(&blc_input -> read_lock);
}

#ifdef MAKE_TEST_IBLC
// Stand-alone test driver: reads the data set given as argv[1], remembers a
// position every 20 million reads, then seeks back to each remembered position
// and reads on from there.
int main(int argc, char ** argv){
	assert(argc>1);
	input_BLC_t blc_input;
	input_BLC_pos_t poses[10];
	const int max_poses = (int)(sizeof(poses)/sizeof(poses[0]));
	int orv = input_BLC_init(&blc_input, argv[1]), total_poses = 0;
	printf("orv=%d, bases=%d, filter_gzip=%d, data_gzip=%d\n", orv, blc_input.total_bases_in_each_cluster, blc_input.filter_is_gzipped, blc_input.bcl_is_gzipped);
	if(orv) return 1;	// nothing was opened, reading would use invalid handles

	while(1){
		char base[1000], qual[1000], rname[200];
		base[0]=qual[0]=rname[0]=0;
		orv = input_BLC_next_read(&blc_input, rname, base, qual);
		assert(orv>=0);
		if(0==orv) break;
		// base and qual are not NUL-terminated; orv is their length.
		if(blc_input.read_number%1000000==1)printf("%s %.*s %.*s\n", rname, orv, base, orv, qual);
		if(blc_input.read_number % 20000000 == 1 && total_poses < max_poses){
			if(0 == input_BLC_tell(&blc_input, poses + total_poses)) total_poses++;
		}
		if(blc_input.read_number > 80000010) break;
	}

	int ii;
	for(ii = 0; ii < total_poses; ii++){
		long long jj;
		printf("\n\n========== SEEKING %d OF %d ================\n", ii,total_poses);
		int seek_rv = input_BLC_seek( &blc_input, poses+ii );
		input_BLC_destroy_pos( &blc_input, poses+ii );
		assert(0 == seek_rv);
		for(jj = 0 ; jj < 3000011; jj++){
			char base[1000], qual[1000], rname[200];
			orv = input_BLC_next_read(&blc_input, rname, base, qual);
			assert(orv>=0);
			if(0==orv) break;	// end of data: the buffers hold nothing to print
			if(blc_input.read_number%1000000==2)printf("%s %.*s %.*s\n", rname, orv, base, orv, qual);
		}
	} 
	
	printf("END CORRECTLY  R=%llu\n", blc_input.read_number);
	input_BLC_close(&blc_input);
	return 0;
}
#endif
