#
#  mSLCA.r
#  All rights reserved to SHINYA, TOQUE, and Kokuvo
#   
#  28 Jun 2007 : Start 
#  29 Jun 2007 : Add SearchMemory function to search list object
#  30 Jun 2007 : For Batch mode (add argms)
#

SharedLoci <- function(x,y)
#  Check how many locus that share at least one allele between two individuals
{
  shared = 0
  for ( s in 2:ncol(x)){
    if ((regexpr(substring(x[,s],1,1),y[,s]) >= 1) || (regexpr(substring(x[,s],2,2),y[,s]) >= 1)){
      shared <- shared + 1 
    }
  }
  if (shared == (ncol(x)-1)){
    return(2)
  }else{
    return(1)
  }
}

SearchMemory <- function(memory, elem)
# Search list(=memory) object if there is object (which you are looking for),
#  return TRUE  
{
  if (length(memory)!=0){
    for (i in 1:length(memory)){
      if (elem == memory[i]){return(TRUE)}
    }
    return(FALSE)
  }
  return(FALSE)
}


#####################################################
#   Main Routine
#####################################################

library(MASS)
args <- commandArgs()
if (length(args)!=5){
  stop("wrong number of arguments")
}
df <- read.table(args[5],header=F)
output <- paste("result",args[5],sep="-")
geneDistance <- matrix(0:0,length(df[[1]]),length(df[[1]]))
          # geneDistance is Distance matrix

#####################################################
#   Main Routine(Make Distance matrix)
#####################################################
for (clmn in 1:length(df[[1]])){           # clmn is column
   for (rw in 1:length(df[[1]])){           # rw is row      -> matrix[clmn,rw]
     geneDistance[clmn, rw] <- SharedLoci(df[clmn,],df[rw,])
   }
}
colnames(geneDistance) <- df[,1]
rownames(geneDistance) <- df[,1]

#####################################################
#   Main Routine(Correspondence analysis)
#####################################################
caScore <- as.vector(corresp(geneDistance)$rscore)

#####################################################
#   Main Routine
# (Sorting distance matrix according to score of CA)
#####################################################

geneDistance <- geneDistance[order(caScore,decreasing=F),order(caScore,decreasing=F)]
write.table(geneDistance,file=output)  ## <- output sorted matrix

#####################################################
#   Main Routine
# (Estimate sibling groups from clustered matrix)
#####################################################

memory <- as.list(NULL)
nFamily <- 0

sink(output,append=TRUE)
cat("\n\nEstimated sibling groups\n")
for (clmn in 1:ncol(geneDistance)){           # clmn is column
  gr <- as.vector(NULL)
  if (SearchMemory(memory, clmn)==FALSE){
    for (rw in 1:nrow(geneDistance)){           # rw is row -> matrix[clmn,rw]
      if (geneDistance[clmn,rw] == 2){
        if (SearchMemory(memory, rw)==FALSE){
          gr[length(gr)+1] <- rw
        }
        memory[length(memory)+1] <- rw
      }
    }
    if (length(gr)!=0){
      nFamily <- nFamily+1
      cat("Family", nFamily,": ")
      write.table(df[order(caScore,decreasing=F),1][gr],eol=" ",row.names=FALSE,col.names=FALSE)
      cat("\n")
    }
  }
}
cat("Total number of Families : ", nFamily,"\n")
sink()
