#!/bin/bash

# Copyright 2003 Sashidhar Gadiraju, Peter K. Rogan
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

#Program : getchrfiles
#Version : 0.2
#Description:
#This script gets the human (or other UCSC assembly) genome sequence fasta files and the annotation files from UCSC and uncompresses them.
# Script-wide settings.
# PRNAME is the name this script was invoked as, with any leading directory
# components stripped; it is used in the help text.
PRNAME="${0##*/}"
SUFX=".fa.zip"	#fasta file suffix
MSUFX="_mrna.txt.gz"	#mrna annotation file suffix
# Default download root; can be replaced at run time with the -U option.
BASEURL="ftp://genome.cse.ucsc.edu/goldenPath"

# Print an optional error banner followed by the help text, then exit.
# Arguments: $1 - optional error message; when non-empty it is printed in a
#                 starred banner ahead of the help text.
# Globals:   PRNAME, BASEURL (read)
# Outputs:   stdout for a plain help request, stderr when an error message
#            is given.
# Exits:     0 for a plain help request, 1 when an error message is given,
#            so that a bad invocation is not reported as success.
function usage
{
	local errmsg="$1"
	local status=0
	local out=1	# file descriptor that receives the text

	if [ -n "$errmsg" ]; then
		status=1
		out=2
	fi

	{
		if [ -n "$errmsg" ]; then
			printf '%s\n' \
				"******************************" \
				"$errmsg" \
				"******************************"
		fi
		printf '%s\n' \
			"" \
			"usage: $PRNAME [-hU] UCSC_ASSEMBLY" \
			"This program downloads the genome fasta files and unzips the " \
			"respective annotations files based on the chromosome list " \
			"specified in the file 'chrmlist'." \
			"The UCSC_ASSEMBLY takes the value of a directory name listed in" \
			"	$BASEURL" \
			"This URL can be overrided with the -U option" \
			"-h : this help message" \
			"-U : Base_URL" \
			"	Specify a URL for downloading the files" \
			"	Default: $BASEURL" \
			"	The fasta files are downloaded from the URL" \
			"		Base_URL/UCSC_ASSEMBLY/chromosomes" \
			"	The annotation files are downloaded from the URL" \
			"		Base_URL/UCSC_ASSEMBLY/database" \
			"" \
			"Ex: To get the April 2003 human genome draft files, type at the prompt" \
			"$PRNAME 10april2003 " \
			"Ex: To get the Feb 2003 mouse genome draft files, type at the prompt" \
			"$PRNAME mmFeb2003" \
			"'10april2003' and 'mmFeb2003' are directories listed at" \
			"	$BASEURL"
	} 1>&"$out"

	exit "$status"
}

# Main flow: parse the command line, then for every chromosome listed in
# ./chrmlist download and uncompress its fasta archive and its mRNA
# annotation file into the current directory.

# Bare invocation: show the help text.
if [ $# -lt 1 ]; then
	usage
fi

# The leading ':' in the optstring puts getopts in silent mode: an unknown
# option yields '?' and a missing option argument yields ':', with the
# offending option letter in OPTARG in both cases.
while getopts ":hU:" Option
do
	case "$Option" in
		h ) usage;;
		U ) BASEURL="$OPTARG";;
		: ) usage "-$OPTARG option requires an argument";;
		* ) usage "-$OPTARG option not implemented";;
	esac
done
# Discard the parsed options so that $1 is the UCSC_ASSEMBLY operand even
# when -U was supplied.
shift $((OPTIND - 1))

if [ $# -lt 1 ]; then
	usage "UCSC_ASSEMBLY not specified"
fi
ASSEMBLY="$1"

#find the chromosome list
if [ ! -f chrmlist ]; then
	echo "Cannot find file chrmlist" >&2
	exit 1
fi

echo "'wget'ting the files from $BASEURL"
URL="$BASEURL/$ASSEMBLY/chromosomes"
MURL="$BASEURL/$ASSEMBLY/database"

# Set to 1 as soon as any download or decompression step fails; used as the
# exit status so callers can detect an incomplete run.
failed=0

# Skip blank lines and comment lines (first non-space character is '*') and
# take the second whitespace-separated field of every remaining line as the
# chromosome name. The list is read on fd 3 so that the commands inside the
# loop keep the script's own stdin.
while read -r -u 3 chrom
do
	echo "getting $chrom"

	# Only try to uncompress a file that was actually downloaded; a failure
	# for one file does not stop the remaining downloads.
	if wget "$URL/chr${chrom}${SUFX}"; then
		unzip "chr${chrom}${SUFX}" || failed=1
	else
		echo "failed to download $URL/chr${chrom}${SUFX}" >&2
		failed=1
	fi

	if wget "$MURL/chr${chrom}${MSUFX}"; then
		gunzip "chr${chrom}${MSUFX}" || failed=1
	else
		echo "failed to download $MURL/chr${chrom}${MSUFX}" >&2
		failed=1
	fi
done 3< <(awk '!/^ *$/ && !/^ *[*]/ && NF >= 2 { print $2 }' chrmlist)

exit "$failed"
