Differences

This shows you the differences between two versions of the page.

--- scripts:adapter_and_quality_trimming [2025/03/09 07:35] – created 37.26.174.181
+++ scripts:adapter_and_quality_trimming [2025/03/25 10:37] (current) – 37.26.174.181
@@ Line 1: / Line 1: @@
-====== How to perform quality and adapter trimming of your reads ======
+====== How to perform quality control and adapter trimming of your reads with cutadapt and fastp ======
 ===== What to trim =====
@@ Line 161: / Line 161: @@
 src/cutadapt.sh $input_dir $output_dir $trim_5p $trim_3p $threads $paired
+</code>
+===== Example script with Fastp =====
+==== Daughter script ====
+This script takes the input and output directories, the number of bases to trim from each end, and the number of threads for parallel processing as input, and generates trimmed FASTQ files together with fastp reports (html, json). It operates in single-end and paired-end mode. Please note that both files of a paired-end sample must be processed together in paired-end mode; otherwise some reads may be removed from one file but kept in the other, which will lead to problems during alignment.
+The script searches for the **Illumina Universal Adapter** by default. You can modify the script to use one of the two other adapters defined in it (just set the one you choose as the ''adapter'' variable), or you can add your own. It also trims polyG and polyX tails and filters out low-complexity reads. There is also a ''**-****-**detect_adapter_pe'' option, which automatically detects adapters for paired-end reads (to enable it, remove the ''#'' in front of it in the script).
+**fastp.sh**
+<code>
+#!/bin/bash
+# ------------------------------------------
+# SCRIPT TO CUT ADAPTERS AND TRIM LOW-QUALITY READS
+# Processes FASTQ files using `fastp` to:
+#   1. Remove adapter sequences, polyG/polyX tails and low-complexity reads.
+#   2. Trim a fixed number of bases from the 5' and 3' ends of every read.
+#   3. Apply fastp's read quality filter, counting bases with Q < 20 as unqualified.
+#   4. Discard reads shorter than 20 bp after trimming.
+#   5. Works for both single-end and paired-end reads.
+#
+# Usage: fastp.sh <input_dir> <output_dir> <trim_5p> <trim_3p> <threads> [paired]
+# Exit status: 0 on success, 1 on bad arguments or if fastp failed for any sample.
+# ------------------------------------------
+# Input parameters
+input_dir="$1"
+output_dir="$2"
+trim_5p="$3"      # Number of bases to trim from the 5' end
+trim_3p="$4"      # Number of bases to trim from the 3' end
+threads="$5"      # Number of threads
+paired="$6"       # Set to "true" for paired-end processing
+# ------------------------------------------
+# DEFAULT ADAPTER SETS
+# ------------------------------------------
+# Illumina universal adapter (most common)
+illumina_universal_adapter="AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
+# Illumina Nextera adapter
+nextera_adapter="CTGTCTCTTATACACATCT"
+# TruSeq adapter
+truseq_adapter="AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
+# Use the Illumina universal adapter by default
+# (set this to "$nextera_adapter" or "$truseq_adapter" to switch)
+adapter="$illumina_universal_adapter"
+# ------------------------------------------
+# VALIDATION
+# ------------------------------------------
+if [[ -z "$input_dir" || -z "$output_dir" || -z "$trim_5p" || -z "$trim_3p" || -z "$threads" ]]; then
+    echo "Usage: $0 <input_dir> <output_dir> <trim_5p> <trim_3p> <threads> [paired]" >&2
+    exit 1
+fi
+# Create output directory if it doesn't exist
+mkdir -p "$output_dir"
+# Log the command execution
+echo "Script initiated with command: $0 $*"
+echo "Using default adapter: $adapter"
+echo "Trimming $trim_5p bases from the 5' end and $trim_3p bases from the 3' end"
+# ------------------------------------------
+# FASTP OPTIONS
+# Options are collected in arrays instead of one long backslash-continued
+# command: a commented-out line inside a continued command ends the command
+# at that point, so every option after it would be silently dropped.
+# ------------------------------------------
+# Options shared by single-end and paired-end runs
+common_opts=(
+    --trim_poly_g --trim_poly_x
+    --low_complexity_filter
+    --qualified_quality_phred 20
+    --length_required 20
+    --thread "$threads"
+    --trim_front1 "$trim_5p" --trim_tail1 "$trim_3p"
+    --adapter_sequence "$adapter"
+)
+# Extra options used only in paired-end mode
+paired_opts=(
+    --trim_front2 "$trim_5p" --trim_tail2 "$trim_3p"
+    # Remove the leading # on the next line to auto-detect adapters for paired-end reads
+    #--detect_adapter_pe
+)
+# Set to 1 as soon as fastp fails for any sample
+failed=0
+# ------------------------------------------
+# PROCESS FILES WITH FASTP
+# ------------------------------------------
+if [[ "$paired" == "true" ]]; then
+    # ---------- PAIRED-END MODE ----------
+    echo "Running in paired-end mode..."
+    for file1 in "$input_dir"/*_1.{fastq,fq}.gz "$input_dir"/*.R1.{fastq,fq}.gz; do
+        # Patterns that match nothing stay literal; skip them
+        [[ -f "$file1" ]] || continue
+        # Derive the mate file by swapping only the read suffix at the END of
+        # the name, so "_1" or "R1" elsewhere in the path is left untouched.
+        for suffix in _1.fastq.gz _1.fq.gz .R1.fastq.gz .R1.fq.gz; do
+            if [[ "$file1" == *"$suffix" ]]; then
+                stem="${file1%"$suffix"}"
+                file2="${stem}${suffix/1/2}"
+                break
+            fi
+        done
+        if [[ ! -f "$file2" ]]; then
+            echo "Paired file not found for $file1" >&2
+            continue
+        fi
+        # Sample name = file name without directory and read suffix
+        filename="${stem##*/}"
+        echo "Processing paired-end files: $file1 and $file2 using $threads threads..."
+        # Run fastp for paired-end
+        if fastp --in1 "$file1" --in2 "$file2" \
+                 --out1 "$output_dir/${filename}_1.fq.gz" \
+                 --out2 "$output_dir/${filename}_2.fq.gz" \
+                 "${common_opts[@]}" \
+                 "${paired_opts[@]}" \
+                 --html "$output_dir/${filename}_fastp.html" \
+                 --json "$output_dir/${filename}_fastp.json"; then
+            echo "Finished processing paired-end files: $filename"
+        else
+            echo "ERROR: fastp failed for paired-end files: $filename" >&2
+            failed=1
+        fi
+    done
+else
+    # ---------- SINGLE-END MODE ----------
+    echo "Running in single-end mode..."
+    for file in "$input_dir"/*.{fastq,fq}.gz; do
+        # Patterns that match nothing stay literal; skip them
+        [[ -f "$file" ]] || continue
+        # Output keeps the input file name, normalised to the .fq.gz extension
+        filename="${file##*/}"
+        if [[ "$filename" == *.fastq.gz ]]; then
+            filename="${filename%.fastq.gz}.fq.gz"
+        fi
+        echo "Processing single-end file: $filename using $threads threads..."
+        # Run fastp for single-end
+        if fastp --in1 "$file" \
+                 --out1 "$output_dir/$filename" \
+                 "${common_opts[@]}" \
+                 --html "$output_dir/${filename}_fastp.html" \
+                 --json "$output_dir/${filename}_fastp.json"; then
+            echo "Finished processing single-end file: $filename"
+        else
+            echo "ERROR: fastp failed for single-end file: $filename" >&2
+            failed=1
+        fi
+    done
+fi
+# Completion message
+if [[ "$failed" -ne 0 ]]; then
+    echo "Some files could not be processed; see the errors above." >&2
+    exit 1
+fi
+echo "All files processed. Output saved in $output_dir."
+</code>
+==== Parent script ====
+**fastp_00.sh**
+<code>
+#!/bin/bash
+# Slurm submission script: runs src/fastp.sh on every FASTQ file in input_dir.
+#SBATCH --mem=32gb
+#SBATCH --cpus-per-task=10
+#SBATCH --job-name=fastp_00
+# %j in the log file name will be replaced with the job ID.
+# Make sure the log/ directory exists before submitting the job.
+#SBATCH --output=log/fastp_00%j.log
+# Parameters (example, modify as needed)
+input_dir=fq_renamed
+output_dir=fq_trimmed
+trim_5p=10	# Number of bases to trim from the 5' end
+trim_3p=0	# Number of bases to trim from the 3' end
+threads=10	# Number of threads (keep in line with --cpus-per-task above)
+paired=true	# Set to "true" for paired-end processing
+# Quote every argument so directory names containing spaces are passed intact
+src/fastp.sh "$input_dir" "$output_dir" "$trim_5p" "$trim_3p" "$threads" "$paired"
 </code>