Bash snippets that are useful when doing bioinformatics jobs.
These snippets are grouped by usage scenario. Some of them favor readability over efficiency.
# Print the Nth line of a file (replace NUM with the 1-based line number).
# head/tail variant: prints the LAST line instead when the file has fewer than NUM lines.
head -n NUM filename | tail -n 1
# sed variant: stops reading right after line NUM and prints nothing when the file is shorter.
sed 'NUMq;d' filename
# add something to the end of all header lines (replace SUFFIX with the text to append)
# the pattern is anchored so that only lines starting with ">" are touched
sed 's/^>.*/&SUFFIX/' file.fa > outfile.fa
# Clean up a fasta file so that only the first whitespace-delimited word of each header remains.
# Sequence lines are passed through untouched, even if they contain blanks.
awk '/^>/ { print $1; next } { print }' file.fa > output.fa
# count the number of sequences in a fasta file (one header line, starting with ">", per sequence)
grep -c '^>' file.fa
# same result, at the cost of an extra process
grep '^>' file.fa | wc -l
# add something to the end of all header lines, awk alternative to the sed one-liner shown earlier
# (replace SUFFIX with the text to append; only lines starting with ">" are touched)
awk '/^>/ { $0 = $0 "SUFFIX" } { print }' file.fa > outfile.fa
# Extract the sequence ids of a fasta file: the text between ">" and the first whitespace.
# Matching on non-whitespace (instead of \w) keeps ids that contain "|", "." or "-" intact,
# e.g. "NM_000546.6"; cut then strips the leading ">".
grep -o -E '^>[^[:space:]]+' file.fasta | cut -c 2-
# Return the lengths of all the sequences in a multifasta.
# Usage:   falens [file ...]   (reads stdin when no file is given)
# Outputs: every header line followed by a line holding the length of its
#          sequence; line breaks inside a wrapped sequence are not counted.
#          Nothing is printed for empty input.
falens() {
  awk '
    /^>/ {
      # Close the previous record. "in_record" makes an empty sequence report
      # 0 instead of being skipped, so header and length lines stay paired.
      if (in_record || seqlen) print seqlen + 0
      print
      seqlen = 0
      in_record = 1
      next
    }
    { seqlen += length($0) }
    END { if (NR > 0) print seqlen + 0 }
  ' "$@"
}
# Remove duplicated records from a multifasta.
# Usage:   dedupe [file ...]   (reads stdin when no file is given)
# Outputs: the input with every repeated record dropped; the first occurrence
#          is kept with its original line layout.
# Two records are duplicates when both the header and the sequence (ignoring
# line wrapping) are identical. Working on whole records rather than single
# lines keeps the sequence of a record whose residues also occur under a
# different header. Each record is buffered in memory before it is printed.
dedupe() {
  awk '
    # Print the buffered record unless an identical one was already printed.
    function emit_record() {
      if (buffered && !seen[header SUBSEP residues]++) printf "%s", record
    }
    /^>/ {
      emit_record()
      header = $0
      residues = ""
      record = $0 "\n"
      buffered = 1
      next
    }
    {
      residues = residues $0
      record = record $0 "\n"
      buffered = 1
    }
    END { emit_record() }
  ' "$@"
}
# Merge a multifasta into a single fasta sequence: keep the first line as the
# only header, drop every later header and join the remaining lines.
# Usage:   fastcat [file ...]   (reads stdin when no file is given)
# Outputs: the first input line, then all sequence data joined on one line.
# Implemented as a single streaming awk pass, so it does not depend on
# GNU-specific sed syntax and does not hold the whole file in memory.
fastcat() {
  awk '
    NR == 1 { print; next }                   # first line is kept verbatim
    /^>/    { next }                          # later headers are dropped
            { printf "%s", $0; joined = 1 }   # sequence lines are glued together
    END     { if (joined) printf "\n" }       # terminate the joined sequence line
  ' "$@"
}
# Split a multifasta into separate files, one per record.
# Usage:   splitfa file
# Outputs: seq1.fasta, seq2.fasta, ... in the current directory, numbered in
#          input order. An existing file of the same name is overwritten, so
#          running the function twice does not duplicate content.
# Blank lines are skipped; surrounding whitespace of each line is trimmed.
splitfa() {
  local line out=""
  local idx=0
  # read -r keeps backslashes literal; the "|| [[ -n ... ]]" part also handles
  # a last line that lacks a trailing newline.
  while read -r line || [[ -n "$line" ]]; do
    if [[ "$line" == ">"* ]]; then
      # A header opens the next record. Advancing the counter here (not after
      # each sequence line) keeps a wrapped sequence together in one file.
      idx=$((idx + 1))
      out="seq${idx}.fasta"
      printf '%s\n' "$line" > "$out"
    elif [[ -n "$line" ]]; then
      if [[ -z "$out" ]]; then
        # Sequence data that precedes any header still gets a file of its own.
        idx=$((idx + 1))
        out="seq${idx}.fasta"
        : > "$out"
      fi
      printf '%s\n' "$line" >> "$out"
    fi
  done < "$1"
}
# Get the nucleotide count (A, U, G, C) for all sequences in a multi sequence file.
# Prints a header row and then one tab-separated row per record. Counts are
# case-sensitive (upper-case bases only) and are accumulated over all lines of
# a wrapped sequence. A base that does not occur is reported as 0, so the
# columns always stay aligned.
awk '
  # Print the row of the record collected so far, if there is one.
  function report() {
    if (in_record) print id, a + 0, u + 0, g + 0, c + 0
  }
  BEGIN { OFS = "\t"; print "seq_id", "A", "U", "G", "C" }
  /^>/ {
    report()
    id = substr($0, 2)      # header text without the leading ">"
    a = u = g = c = 0
    in_record = 1
    next
  }
  {
    # gsub returns the number of replacements made, i.e. the number of
    # occurrences; replacing a base by itself leaves the line unchanged.
    a += gsub(/A/, "A")
    u += gsub(/U/, "U")
    g += gsub(/G/, "G")
    c += gsub(/C/, "C")
  }
  END { report() }
' your_fasta_file.fa
# Linearize the complete fasta file: every record becomes exactly two lines,
# the header and the whole sequence joined on a single line.
# read -r and printf keep backslashes and glob characters (e.g. the "*" stop
# symbol of protein sequences) literal.
{
  started=0
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" == ">"* ]]; then
      # Terminate the sequence line of the previous record; before the very
      # first line there is nothing to terminate.
      if (( started )); then
        printf '\n'
      fi
      printf '%s\n' "$line"
    else
      printf '%s' "$line"
    fi
    started=1
  done
  # Terminate the sequence line of the last record.
  if (( started )); then
    printf '\n'
  fi
} < input.fasta > output.fasta
# or, with a single awk call (much faster on large files)
awk '
  /^>/ { if (NR > 1) printf "\n"; print; next }   # end the previous sequence line, then print the header
       { printf "%s", $0 }                        # "%s" keeps the data from being used as a format string
  END  { if (NR > 0) printf "\n" }                # terminate the last sequence line
' input.fasta > output.fasta
# and then pick up a certain id (header line plus its sequence line) from the linearized file
# -F matches the id literally; sed removes the "--" separator lines that grep -A
# prints between non-adjacent matches
grep -A 1 -F 'Q15049' output.fasta | sed '/^--$/d'