####################################################################################################
####################################################################################################
####################################################################################################
#
# intervalFrequencyCalculator
#
# generation of a frequency distribution for all genomic gain/loss segments specified in a standard
# segments file (see "segmentsReader" documentation for format)
# segments are not bound to any borders; instead, a non-overlapping segment distribution is
# generated from the segments input list
#
# parameters have to be provided as a list with the minimal information being the full path
# to the UCSC style cytoband file as well as the segments file itself
#
# © 2009-2010, Michael Baudis (www.progenetix.net)
# pls. use code with attribution
#
# 200100105
#
####################################################################################################
####################################################################################################
####################################################################################################
intervalFrequencyCalculator <- function(suppliedSettings) {
localSettings=list(
caseNumber=0,
chromosomesToPlot=1:24,
projectName='CGH project',
segmentPercentsFileName='segmentFrequencyFile.txt'
)
localSettings <- modifyList(localSettings, suppliedSettings)
attach(localSettings)
segmentsFiltered <- segmentsReader(localSettings)
if (caseNumber==0) caseNumber <- length(levels(as.factor(segmentsFiltered[,1])))
project <- vector(mode="character")
chromosome <- vector(mode="character")
segStart <- vector(mode="numeric")
segStop <- vector(mode="numeric")
segType <- vector(mode="numeric")
segValue <- vector(mode="numeric")
derivedSegments <- NULL
for (i in 1:length(chromosomesToPlot)) {
currentChro <- chromosomesToPlot[i]
segs <- subset(segmentsFiltered, currentChro==segmentsFiltered[,2])
# all starts are generated from all starts + (all ends)+1
segEdgeNumbers <- unique(sort(c(segs[,3], ((segs[,4])+1))))
# since we have all ends "virtually" included here, we can re-generate them from the following start-1
# the last end is non-existant in the original data, so we move until the 3rd value from the
# end (original last start) with the interval ending at the 2nd last value -1
for(i in 1:(length(segEdgeNumbers) - 2)) derivedSegments <- rbind(derivedSegments, c(currentChro, segEdgeNumbers[i], (segEdgeNumbers[i+1])-1))
}
for (j in 1:nrow(derivedSegments)) {
currentChro <- derivedSegments[j,1]
currentBaseStart <- derivedSegments[j,2]
currentBaseStop <- derivedSegments[j,3]
# getting all segments overlapping with the interval
segs <- subset(segmentsFiltered, currentChro==segmentsFiltered[,2])
segs <- subset(segs, currentBaseStart <= segs[,4])
segs <- subset(segs, currentBaseStop >= segs[,3])
gainSegs <- subset(segs, segs[,5] >= segGainThresh)
gainSegCaseCount <- length(levels(as.factor(gainSegs[,1])))
gainSegPercents <- round(gainSegCaseCount*100/caseNumber, digits = 1)
if (gainSegPercents > 0) {
project <- c(project, projectName)
chromosome <- c(chromosome, currentChro)
segStart <- c(segStart, currentBaseStart)
segStop <- c(segStop, currentBaseStop)
segType <- c(segType, 1)
segValue <- c(segValue, gainSegPercents)
}
lossSegs <- subset(segs, segs[,5] <= segLossThresh)
lossSegCaseCount <- length(levels(as.factor(lossSegs[,1])))
lossSegPercents <- round(lossSegCaseCount*100/caseNumber, digits = 1)
if (lossSegPercents > 0) {
project <- c(project, projectName)
chromosome <- c(chromosome, currentChro)
segStart <- c(segStart, currentBaseStart)
segStop <- c(segStop, currentBaseStop)
segType <- c(segType, -1)
segValue <- c(segValue, lossSegPercents)
}
}
write.table(file=file.path(outputPath, segmentPercentsFileName), cbind(project, chromosome, segStart, segStop, segType, segValue), quote=F, sep="\t", row.names=F)
detach(localSettings)
}
####################################################################################################
####################################################################################################
####################################################################################################