####################################################################################################
####################################################################################################
####################################################################################################
#
# intervalFrequencyCalculator
#
# generation of a frequency distribution for all genomic gain/loss segments specified in a standard
# segments file (see "segmentsReader" documentation for format)
# segments are not bound to any borders; instead, a non-overlapping segment distribution is
# generated from the segments input list
#
# parameters have to be provided as a list with the minimal information being the full path
# to the UCSC style cytoband file as well as the segments file itself
#
# © 2009-2010, Michael Baudis (www.progenetix.net)
# pls. use code with attribution
#
# 200100105
#
####################################################################################################
####################################################################################################
####################################################################################################

intervalFrequencyCalculator <- function(suppliedSettings) {
	localSettings=list(
		caseNumber=0,
		chromosomesToPlot=1:24,
		projectName='CGH project',
		segmentPercentsFileName='segmentFrequencyFile.txt'
	)
	localSettings <- modifyList(localSettings, suppliedSettings)
	attach(localSettings)

	segmentsFiltered <- segmentsReader(localSettings)
	if (caseNumber==0) caseNumber <- length(levels(as.factor(segmentsFiltered[,1])))
	project <- vector(mode="character")
	chromosome <- vector(mode="character")
	segStart <- vector(mode="numeric")
	segStop <- vector(mode="numeric")
	segType <- vector(mode="numeric")
	segValue <- vector(mode="numeric")

	derivedSegments <- NULL
	for (i in 1:length(chromosomesToPlot)) {
		currentChro <- chromosomesToPlot[i]
		segs <- subset(segmentsFiltered, currentChro==segmentsFiltered[,2])
		# all starts are generated from all starts + (all ends)+1
		segEdgeNumbers <- unique(sort(c(segs[,3], ((segs[,4])+1))))
		# since we have all ends "virtually" included here, we can re-generate them from the following start-1
		# the last end is non-existant in the original data, so we move until the 3rd value from the
		# end (original last start) with the interval ending at the 2nd last value -1
		for(i in 1:(length(segEdgeNumbers) - 2)) derivedSegments <- rbind(derivedSegments, c(currentChro, segEdgeNumbers[i], (segEdgeNumbers[i+1])-1))
	}
	for (j in 1:nrow(derivedSegments)) {
		currentChro <- derivedSegments[j,1]
		currentBaseStart <- derivedSegments[j,2]
		currentBaseStop <- derivedSegments[j,3]
		# getting all segments overlapping with the interval
		segs <- subset(segmentsFiltered, currentChro==segmentsFiltered[,2])
		segs <- subset(segs, currentBaseStart <= segs[,4])
		segs <- subset(segs, currentBaseStop >= segs[,3])
		gainSegs <- subset(segs, segs[,5] >= segGainThresh)
		gainSegCaseCount <- length(levels(as.factor(gainSegs[,1])))
		gainSegPercents <- round(gainSegCaseCount*100/caseNumber, digits = 1)
		if (gainSegPercents > 0) {
			project <- c(project, projectName)
			chromosome <- c(chromosome, currentChro)
			segStart <- c(segStart, currentBaseStart)
			segStop <- c(segStop, currentBaseStop)
			segType <- c(segType, 1)
			segValue <- c(segValue, gainSegPercents)
		}
		lossSegs <- subset(segs, segs[,5] <= segLossThresh)
		lossSegCaseCount <- length(levels(as.factor(lossSegs[,1])))
		lossSegPercents <- round(lossSegCaseCount*100/caseNumber, digits = 1)
		if (lossSegPercents > 0) {
			project <- c(project, projectName)
			chromosome <- c(chromosome, currentChro)
			segStart <- c(segStart, currentBaseStart)
			segStop <- c(segStop, currentBaseStop)
			segType <- c(segType, -1)
			segValue <- c(segValue, lossSegPercents)
		}
	}
	write.table(file=file.path(outputPath, segmentPercentsFileName), cbind(project, chromosome, segStart, segStop, segType, segValue), quote=F, sep="\t", row.names=F)
	detach(localSettings)
}

####################################################################################################
####################################################################################################
####################################################################################################