====== Download FASTQ files from SRA ======

The easiest way to download FASTQ files from SRA is the ''fastq-dump'' command from the SRA Toolkit:
  
''fastq-dump --gzip --split-3 SRR[accession ID]''
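For example, with a hypothetical accession:

<code>
fastq-dump --gzip --split-3 SRR1234567
</code>

For a paired-end run this produces ''SRR1234567_1.fastq.gz'' and ''SRR1234567_2.fastq.gz'' (plus ''SRR1234567.fastq.gz'' if there are unmated reads).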
  
Options:
  * ''--gzip'': compresses the output FASTQ files with gzip
  * ''--split-3'': splits paired-end reads into separate _1 and _2 files; unmated reads go into a third file

The following daughter script (''src/fq_download.sh'') reads a list of accession IDs and downloads them in parallel:
  
<code>
#!/bin/bash

# SCRIPT FOR DOWNLOADING SRA FILES USING THE NCBI SRA TOOLKIT
# NOTE: Run this script from the directory where the "log" directory is located
#
# PURPOSE:
#   This script reads SRA accession IDs from a given file (one per line)
#   and downloads each corresponding SRA file using fastq-dump.
#
# PARAMETERS:
#   $1  output directory for the downloaded FASTQ files
#   $2  text file with SRA accession IDs (one per line)
#
# SAMPLE USAGE:
#   sbatch src/fq_download.sh <output_directory> <accession_file>
#
# IMPORTANT:
#   - This script downloads files using fastq-dump, gzips and splits paired-end reads (it does nothing to single-end reads).
#   - Ensure that the SRA Toolkit is installed and available.
  

# Read parameters: output directory and accession list file
outdir="$1"
accession_file="$2"

# Create the output directory if it doesn't exist
mkdir -p "$outdir"
  
# Function to download an SRA accession using fastq-dump (with --gzip and --split-3)
download_sra() {
    local acc="$1"
    echo "Downloading accession: $acc"
    if fastq-dump --gzip --split-3 "$acc" -O "$outdir"; then
        echo "Successfully downloaded: $acc"
        return 0
    else
        echo "Error: fastq-dump failed for accession: $acc"
        return 1
    fi
}

# Retry a failed download up to max_retries times
download_with_retry() {
    local acc="$1"
    local max_retries=3
    local attempt=1
    while [ $attempt -le $max_retries ]; do
        echo "Attempt $attempt for $acc"
        if download_sra "$acc"; then
            return 0  # Success
        fi
        ((attempt++))
    done
-    echo "Failed all $max_retries attempts for $acc" >> "$log_file" +    echo "Failed all $max_retries attempts for $acc"
-    # Append the failed accession to the failed_sra.txt file, one per line +
-    echo "$acc" >> "$failed_file"+
     return 1     return 1
 } }

# Export the function definitions and variables so GNU parallel can use them
export -f download_sra download_with_retry
export outdir
  
# Process all accessions in parallel (lines starting with # are ignored)
accessions=$(grep -v '^#' "$accession_file")
if [ -z "$accessions" ]; then
    echo "Error: No valid accessions found in '$accession_file'."
    exit 1
fi

echo "Processing accessions in parallel..."
parallel -j 20 download_with_retry ::: $accessions
  
# Check overall exit status and log the result
if [ "$?" -eq 0 ]; then
    echo "All accessions processed successfully."
else
    echo "One or more accessions encountered errors."
fi
  
</code>

You can then write a short parent SLURM script for each download attempt that simply calls this daughter script:

<code>
#!/bin/bash
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=30
#SBATCH --job-name=dwnld_fq
#SBATCH --output=log/fq_download_00.log  # Main log file name

# Parameters
outdir="fq_original"
accession_file="meta/sra_accessions.txt"
logfile="log/fq_download_00.log"
  
# Start logging
echo "Started at: $(date)" >> "$logfile"
echo "Running fq_download.sh with:" >> "$logfile"
echo "Output dir: $outdir" >> "$logfile"
echo "Accession file: $accession_file" >> "$logfile"

# Call the daughter script and redirect both stdout and stderr to the same log
src/fq_download.sh "$outdir" "$accession_file" >> "$logfile" 2>&1

# Log end time
echo "Finished at: $(date)" >> "$logfile"
  
</code>
  
In this way, each download attempt gets its own named parent script and a log file with the same name, and you don't have to copy-paste the whole code into every parent script: just call the daughter script and that's all!
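For reference, the accession file is plain text with one run ID per line, and the daughter script skips lines starting with ''#''. A hypothetical ''meta/sra_accessions.txt'':

<code>
# hypothetical run IDs for this project
SRR1234567
SRR1234568
SRR1234569
</code>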

====== Download files from EGA ======

With this script you can download files from the EGA database using ''pyega3'' and have them placed directly in the output directory.

You need to set some inputs:

  * Credentials JSON file
  * Number of parallel connections
  * List of file IDs to be downloaded
  * Output directory
  * File format of the files to be downloaded

Make sure you set ''cpus-per-task'' to the product of the number of files to be downloaded and the number of connections, as shown in the example below.
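For instance, with hypothetical numbers, downloading 10 files using 3 connections each would need:

<code>
#SBATCH --cpus-per-task=30   # 10 files x 3 connections
</code>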

Here is an example of a credentials JSON file:

<code>
{
    "username": "name.surname@abi.am",
    "password": "your_password"
}
</code>
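The file ID list is also plain text, with one EGA file accession per line (the script skips empty lines). A hypothetical ''meta/test.txt'':

<code>
EGAF00001234567
EGAF00001234568
</code>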

== The script ==

<code>
#!/bin/bash
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=1
#SBATCH --job-name=dwnld_ega
#SBATCH --output=log/dwnld_ega.log

# Set cpus-per-task to (number of files to download) x (number of connections)
# Set common variables
CREDENTIALS_FILE="meta/credentials.json"
CONNECTIONS=1

# Define the path to the text file containing the file IDs
FILE_ID_LIST="meta/test.txt"

# Define the output directory
FILE_OUTPUT_DIR="output_dir"

# Define the file format
file_format=".bam"

# --- Step 1: Create directories if they don't exist ---
echo "Creating necessary directories..."
mkdir -p "$FILE_OUTPUT_DIR" meta/md5sum log

# --- Step 2: Download files, move, and clean up temporary folders ---
echo "Starting downloads for files from $FILE_ID_LIST..."
# Check if the ID list file exists
if [ ! -f "$FILE_ID_LIST" ]; then
  echo "Error: ID list file not found at $FILE_ID_LIST"
  exit 1
fi

while IFS= read -r file_id; do
  if [ -z "$file_id" ]; then
    continue
  fi
  echo "Downloading file with ID: $file_id"
  pyega3 -c "$CONNECTIONS" -cf "$CREDENTIALS_FILE" fetch "$file_id" --output-dir "$FILE_OUTPUT_DIR" &
done < "$FILE_ID_LIST"
wait

# Move files to their final location and remove the temporary folders
echo "Moving downloaded files and cleaning up..."
while IFS= read -r file_id; do
  if [ -z "$file_id" ]; then
    continue
  fi
  mv "$FILE_OUTPUT_DIR/$file_id"/*"$file_format" "$FILE_OUTPUT_DIR/"
  rm -r "$FILE_OUTPUT_DIR/$file_id"
done < "$FILE_ID_LIST"

# --- Step 3: Perform md5sum on all final files ---
echo "Performing md5sum on all downloaded files..."
# Loop through all downloaded files in the output directory
for file in "$FILE_OUTPUT_DIR"/*"$file_format"; do
  if [ -f "$file" ]; then # Check if the file exists
    filename=$(basename "$file")
    md5sum "$file" > "meta/md5sum/${filename}.md5"
    echo "Generated md5sum for $file"
  fi
done

# --- Step 4: Remove unnecessary log files created by pyega3 ---
if [ -f pyega3_output.log ]; then
  rm pyega3_output.log
  echo "pyega3_output.log removed."
fi

echo "Script finished."
</code>
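Since each ''.md5'' file records the file path relative to the project root, you can later verify the downloads from that same directory, for example:

<code>
# verify every stored checksum against the file on disk
for m in meta/md5sum/*.md5; do md5sum -c "$m"; done
</code>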