Bash snippets

Jan. 3, 2021

Bash snippets that are useful for everyday bioinformatics work.

These snippets are grouped by usage scenario. Some of them favor readability over efficiency.

General usage

# Print the NUM-th line of a file (replace NUM with the 1-based line number).
# If the file has fewer than NUM lines, the head | tail form prints the last
# line of the file, whereas the sed form prints nothing.
head -n NUM filename | tail -n 1
# Delete every line before NUM; at line NUM, print it and quit.
sed 'NUMq;d' filename

Fasta file manipulation


# Append SUFFIX to the end of every header line.
# The pattern is anchored with ^ so only lines that start with ">" are changed.
sed 's/^>.*/&SUFFIX/' file.fa > outfile.fa

# Clean up a FASTA file so that only the first whitespace-delimited field of
# each line remains; this trims header descriptions, while sequence lines that
# contain no whitespace pass through unchanged.
awk '{print $1}' file.fa > output.fa

# Count the number of sequences in a FASTA file, i.e. the lines that start
# with ">". Both commands print the same number; grep -c avoids the extra wc.
grep -c '^>' file.fa
grep '^>' file.fa | wc -l

Unfiled


# Append SUFFIX to the end of every header line (lines starting with ">").
sed 's/^>.*/&SUFFIX/' file.fa > outfile.fa

# Extract the sequence IDs: the text between ">" and the first whitespace of
# each header line. Matching with \w+ would stop at characters such as "|",
# "." or "-" and truncate IDs like "sp|Q15049|MLC1_HUMAN".
sed -n 's/^>\([^[:space:]]\{1,\}\).*/\1/p' file.fasta

# Print the length of every sequence in a multi-FASTA file.
#
# Arguments: $1 - path to the FASTA file (standard input is read if omitted)
# Outputs:   each header line, followed by one line holding the summed length
#            of the sequence lines of that record (0 when a header has no
#            sequence lines). Empty input produces no output.
falens() {
  awk '
    # Print the length of the record collected so far, if there is one.
    function emit_len() {
      if (in_record || seqlen) print seqlen + 0
    }
    /^>/ { emit_len(); print; in_record = 1; seqlen = 0; next }
    { seqlen += length($0) }
    END { emit_len() }
  ' "${1:--}"
}

# Remove duplicated records from a multi-FASTA file.
#
# Two records are duplicates when they have the same header line and the same
# sequence (sequence lines are concatenated before comparing, so line wrapping
# does not matter). The first occurrence is kept with its original layout.
# Comparing whole records rather than single lines keeps a record intact when
# one of its sequence lines also occurs in an earlier, different record.
#
# Arguments: $1 - path to the FASTA file (standard input is read if omitted)
# Outputs:   the de-duplicated FASTA on stdout
dedupe() {
  awk '
    # Print the buffered record unless an identical one was already printed.
    function flush_record(   key) {
      if (text != "") {
        key = head SUBSEP seq
        if (!seen[key]++) printf "%s", text
      }
      head = ""; seq = ""; text = ""
    }
    /^>/ { flush_record(); head = $0; text = $0 "\n"; next }
    { seq = seq $0; text = text $0 "\n" }
    END { flush_record() }
  ' "${1:--}"
}

# Merge a multi-FASTA file into a single FASTA record.
#
# The first line of the input is kept as the header, every later header line
# (starting with ">") is dropped, and all remaining lines are joined into one
# sequence line. Done in a single awk pass so it does not rely on GNU-only sed
# features.
#
# Arguments: $1 - path to the FASTA file (standard input is read if omitted)
# Outputs:   the merged record on stdout
fastcat() {
  awk '
    NR == 1 { print; next }
    /^>/    { next }
            { printf "%s", $0; joined = 1 }
    # Terminate the joined sequence line, if any sequence was written.
    END     { if (joined) print "" }
  ' "${1:--}"
}

# Split a multi-FASTA file into one file per record.
#
# Records are written to seq1.fasta, seq2.fasta, ... in the current directory,
# numbered in input order. The counter advances on each header line, so all
# lines of a wrapped (multi-line) sequence stay in the same file. Each output
# file is truncated when its record starts, so re-running does not append
# duplicate content. Blank lines are skipped.
#
# Arguments: $1 - path to the multi-FASTA file
# Returns:   non-zero if the input cannot be read or an output file cannot be
#            created
splitfa() {
  local line out='' i=0
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ -z "$line" ]] && continue
    if [[ "$line" == '>'* ]]; then
      i=$((i + 1))
      out="seq${i}.fasta"
      : > "$out" || return
    elif [[ -z "$out" ]]; then
      # Sequence data before any header: keep it in the first file.
      i=1
      out="seq1.fasta"
      : > "$out" || return
    fi
    printf '%s\n' "$line" >> "$out"
  done < "$1"
}

# Get the A/U/G/C counts (case-sensitive) for every sequence in a
# multi-sequence FASTA file, as one tab-separated row per record.
# Counts are accumulated per record, so wrapped sequences yield a single row
# and a base that never occurs is reported as 0 instead of being dropped
# (a dropped value would shift the remaining columns).
awk '
  # Print the row for the record collected so far, if there is one.
  function emit_row() {
    if (have_record) printf "%s\t%d\t%d\t%d\t%d\n", id, a, u, g, c
  }
  BEGIN { print "seq_id\tA\tU\tG\tC" }
  /^>/ {
    emit_row()
    id = $0
    gsub(/>/, "", id)
    a = u = g = c = 0
    have_record = 1
    next
  }
  {
    # gsub returns the number of matches; "&" leaves the line unchanged.
    a += gsub(/A/, "&")
    u += gsub(/U/, "&")
    g += gsub(/G/, "&")
    c += gsub(/C/, "&")
  }
  END { emit_row() }
' your_fasta_file.fa

# Linearize the complete FASTA file: join the wrapped sequence lines of each
# record so that every record becomes two lines (header, then sequence).
# Lines are read with IFS= and -r and written with printf so whitespace,
# backslashes and glob characters in the data are preserved as-is.
{
  started=0
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" == '>'* ]]; then
      # Terminate the previous sequence line, except before the first line.
      if (( started )); then printf '\n'; fi
      printf '%s\n' "$line"
    else
      printf '%s' "$line"
    fi
    started=1
  done < input.fasta
  # End the last sequence line so the output finishes with a newline.
  if (( started )); then printf '\n'; fi
} | sed '/^--$/d' > output.fasta  # drops lines that are exactly "--" (presumably grep context separators)

# or, with a single awk pass.
# The sequence text is passed to printf as an argument of "%s" rather than as
# the format string, so a "%" in the data cannot be misread as a format
# directive; the END rule terminates the last line with a newline.
awk '
  NR == 1 { print; next }
  /^>/    { printf "\n%s\n", $0; next }
          { printf "%s", $0 }
  END     { if (NR > 1) print "" }
' input.fasta > output.fasta

# and then pick up a certain ID with the following command: -A1 prints each
# matching line plus the line after it (header and its sequence)
grep -A1 'Q15049' output.fasta