====== Download FASTQ files from SRA ======
The easiest way to download FASTQ files from SRA is with the ''fastq-dump'' command from the SRA Toolkit:

''fastq-dump --gzip --split-3 <SRA accession> -O <output directory>''

Options:

  * ''--gzip'' - compress the output FASTQ files with gzip
  * ''--split-3'' - write paired-end reads to separate ''_1'' and ''_2'' files (any unpaired reads go into a third file)
  * ''-O'' - directory where the FASTQ files are written
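
For example, a single run could be downloaded like this (the accession and the output directory are placeholders):

<code bash>
# Download one run, compress the output and split paired-end reads
fastq-dump --gzip --split-3 SRR000001 -O data/fastq
# Produces data/fastq/SRR000001_1.fastq.gz and data/fastq/SRR000001_2.fastq.gz
# (plus data/fastq/SRR000001.fastq.gz if any unpaired reads exist)
</code>

To download many accessions at once, in parallel and with automatic retries, you can use the following script: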
<code bash>
#!/bin/bash
# SCRIPT FOR DOWNLOADING SRA FILES USING THE NCBI SRA TOOLKIT
# NOTE: Run this script from the directory where the "src" folder is located.
#
# PURPOSE:
#   This script reads SRA accession IDs from a given file (one per line)
#   and downloads each corresponding SRA file using fastq-dump.
#
# PARAMETERS:
#   $1 - output directory for the downloaded FASTQ files
#   $2 - file containing the SRA accession IDs (one per line)
#
# SAMPLE USAGE:
#   src/download_fastq.sh <output directory> <accession file>
#
# IMPORTANT:
#   - This script downloads files using fastq-dump, gzips and splits paired-end reads
#     (it does nothing to single-end reads).
#   - Ensure that the SRA Toolkit and GNU parallel are installed and available.

# Read the input parameters
outdir="$1"
accession_file="$2"

# Create the output directory if it doesn't exist
mkdir -p "$outdir"

# Function to download an SRA accession using fastq-dump
download_sra() {
    local acc="$1"
    echo "Downloading $acc ..."
    if fastq-dump --gzip --split-3 "$acc" -O "$outdir"; then
        echo "$acc downloaded successfully."
        return 0
    else
        echo "ERROR: download of $acc failed."
        return 1
    fi
}

# Function to retry the download of an accession up to max_retries times
download_with_retry() {
    local acc="$1"
    local max_retries=3   # maximum number of attempts per accession (3 is an assumed default)
    local attempt=1
    while [ $attempt -le $max_retries ]; do
        echo "Attempt $attempt of $max_retries for $acc"
        if download_sra "$acc"; then
            return 0  # Success
        fi
        ((attempt++))
    done
    echo "ERROR: $acc failed after $max_retries attempts."
    return 1
}

# Export the function and variables so the parallel jobs can use them
export -f download_sra download_with_retry
export outdir

# Process all accessions in parallel
accessions=$(grep -v '^[[:space:]]*$' "$accession_file")   # skip empty lines
if [ -z "$accessions" ]; then
    echo "ERROR: no accessions found in $accession_file"
    exit 1
fi
echo "Starting downloads with up to 20 parallel jobs..."
parallel -j 20 download_with_retry ::: $accessions
| - | |||
| - | # Append an extra newline to the failed downloads file | ||
| - | echo "" | ||
| # Check overall exit status and log the result | # Check overall exit status and log the result | ||
| if [ " | if [ " | ||
| - | echo "All accessions processed successfully." >> " | + | echo "All accessions processed successfully." |
| else | else | ||
| - | echo "One or more accessions encountered errors." >> " | + | echo "One or more accessions encountered errors." |
| fi | fi | ||
| - | |||
| - | echo "Job completed on: $(date)" | ||
</code>
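
For instance, assuming a hypothetical accession file, a run could look like this (all paths are illustrative):

<code bash>
# meta/sra_accessions.txt contains one run ID per line, e.g.:
#   SRR000001
#   SRR000002

src/download_fastq.sh data/fastq meta/sra_accessions.txt
</code>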

Instead of running the daughter script directly, you can call it from a parent SLURM script:

<code bash>
#!/bin/bash
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=30
#SBATCH --job-name=dwnld_fq
#SBATCH --output=log/dwnld_fq.out

# Parameters (paths are illustrative - adjust to your project)
outdir="data/fastq"
accession_file="meta/sra_accessions.txt"
logfile="log/dwnld_fq.log"

# Start logging
echo "Job ID: $SLURM_JOB_ID" > "$logfile"
echo "Job started on: $(date)" >> "$logfile"
echo "Output directory: $outdir" >> "$logfile"
echo "Accession file: $accession_file" >> "$logfile"

# Call the daughter script (name is illustrative) and redirect both stdout and stderr to the same log
src/download_fastq.sh "$outdir" "$accession_file" >> "$logfile" 2>&1

# Log end time
echo "Job completed on: $(date)" >> "$logfile"
</code>
This way, you can have a separate parent script for each download attempt, each with a log file of the same name, and you won't have to copy-paste the whole code into every parent script: just call the daughter script and that's all!
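
For example, two download batches could then be submitted through two thin parent scripts (the script names here are made up):

<code bash>
# Each parent script sets its own outdir, accession_file and logfile,
# then calls the same daughter script
sbatch src/dwnld_fq_batch1.sh
sbatch src/dwnld_fq_batch2.sh
</code>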
| + | |||

====== Download files from EGA ======

With this script you can download files from the EGA database using pyega3 and have them placed directly in the output directory.

You need to set some inputs:

  * Credentials JSON file
  * Number of parallel connections
  * List of file IDs to be downloaded
  * Output directory
  * Format of the files to be downloaded

Set the number of cpus-per-task to the number of files to be downloaded multiplied by the number of connections (for example, 5 files with 4 connections each need 20 CPUs).

Here is an example of a credentials JSON file:

<code>
{
    "username": "your.email@example.com",
    "password": "your_password"
}
</code>
| + | |||
| + | ==The script== | ||
| + | |||
<code bash>
#!/bin/bash
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=1
#SBATCH --job-name=dwnld_ega
#SBATCH --output=log/dwnld_ega.out

# Set cpus-per-task to (number of files) x (number of connections)
# Set common variables (paths are illustrative - adjust to your project)
CREDENTIALS_FILE="meta/credentials.json"
CONNECTIONS=1

# Define the path to the text file containing the file IDs
FILE_ID_LIST="meta/ega_file_ids.txt"

# Define output directories
FILE_OUTPUT_DIR="data/ega"

# Define file format
file_format="fastq.gz"

# --- Step 1: Create directories if they don't exist ---
echo "Creating directories..."
mkdir -p "$FILE_OUTPUT_DIR" meta/md5sum log

# --- Step 2: Download files, move, and clean up temporary folders ---
echo "Downloading files..."
# Check that the file ID list exists
if [ ! -f "$FILE_ID_LIST" ]; then
    echo "ERROR: file ID list not found: $FILE_ID_LIST"
    exit 1
fi

while IFS= read -r file_id; do
    if [ -z "$file_id" ]; then
        continue
    fi
    echo "Downloading $file_id ..."
    pyega3 -c "$CONNECTIONS" -cf "$CREDENTIALS_FILE" fetch "$file_id" --output-dir "$FILE_OUTPUT_DIR" &
done < "$FILE_ID_LIST"
wait

# Move files to the final location and remove temporary folders
# (pyega3 saves each file into a subfolder named after its file ID)
echo "Moving files to $FILE_OUTPUT_DIR ..."
while IFS= read -r file_id; do
    if [ -z "$file_id" ]; then
        continue
    fi
    mv "$FILE_OUTPUT_DIR/$file_id"/*."$file_format" "$FILE_OUTPUT_DIR/"
    rm -r "$FILE_OUTPUT_DIR/$file_id"
done < "$FILE_ID_LIST"

# --- Step 3: Perform md5sum on all final files ---
echo "Computing md5sums..."
# Loop through all downloaded files
for file in "$FILE_OUTPUT_DIR"/*."$file_format"; do
    if [ -f "$file" ]; then
        filename=$(basename "$file")
        md5sum "$file" > "meta/md5sum/${filename}.md5"
        echo "md5sum written for $filename"
    fi
done

# --- Step 4: Remove unnecessary log files created by pyega3 ---
if [ -f pyega3_output.log ]; then
    rm pyega3_output.log
    echo "Removed pyega3_output.log"
fi

echo "All done."
</code>
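
As a sketch of a complete run (the IDs and paths are made up): the file ID list holds one EGA file accession per line, and the job is submitted with sbatch:

<code bash>
# meta/ega_file_ids.txt (hypothetical content):
#   EGAF00000000001
#   EGAF00000000002

sbatch src/download_ega.sh
</code>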
| + | |||
scripts/download_fastq.1742973802.txt.gz · Last modified: by 37.26.174.181
