====== Download FASTQ files from SRA ======

The easiest way to download FASTQ files from SRA is to use the ''fastq-dump'' command from the SRA Toolkit:

''fastq-dump --gzip --split-files <SRA accession>''

Options:

''--gzip'' - compress the output FASTQ files with gzip

''--split-files'' - write paired-end reads into separate ''_1'' and ''_2'' files
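
For example, fetching a single paired-end run (the accession below is just a placeholder) produces two gzipped FASTQ files in the current directory:

<code>
fastq-dump --gzip --split-files SRR000001
# produces SRR000001_1.fastq.gz and SRR000001_2.fastq.gz
</code>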

However, you will likely want to put this command inside a script to download multiple files via Slurm. It is also worth making the script a bit more robust to account for possible connection issues during the download. Below is an example script that downloads the files for a provided list of accessions, retrying each download several times in case of failure.

==== Daughter script ====

<code>
#!/bin/bash

# SCRIPT FOR DOWNLOADING SRA FILES USING THE NCBI SRA TOOLKIT
# NOTE: Run this script from the project directory so that the relative paths
#       (output directory, accession file, log folder) resolve correctly.
#
# SAMPLE USAGE:
#   bash src/fq_download.sh <output_dir> <accession_list.txt>
#
# IMPORTANT: the SRA Toolkit (fastq-dump) and GNU parallel must be available on the node.

# Parameters (passed by the parent script)
outdir="$1"
accession_file="$2"

# Create the output directory if it doesn't exist
mkdir -p "$outdir"

# Function to download an SRA accession using fastq-dump (gzip-compressed, paired reads split into separate files)
download_sra() {
    local acc="$1"
    echo "Downloading ${acc}..."
    if fastq-dump --gzip --split-files --outdir "$outdir" "$acc"; then
        echo "Successfully downloaded ${acc}"
        return 0
    else
        echo "Download of ${acc} failed"
        return 1
    fi
}

# Retry wrapper: try each accession several times before giving up
download_with_retry() {
    local acc="$1"
    local max_retries=3
    local attempt=1
    while [ $attempt -le $max_retries ]; do
        echo "Attempt ${attempt}/${max_retries} for ${acc}"
        if download_sra "$acc"; then
            return 0 # Success
        fi
        ((attempt++))
    done
    echo "Giving up on ${acc} after ${max_retries} failed attempts"
    return 1
}

# Export the functions and variables so that GNU parallel can use them
export -f download_sra download_with_retry
export outdir

# Process all accessions in parallel (comment lines in the accession file are skipped)
accessions=$(grep -v '^#' "$accession_file")
if [ -z "$accessions" ]; then
    echo "No accessions found in ${accession_file}"
    exit 1
fi
echo "Starting parallel download of accessions..."
parallel -j 20 download_with_retry ::: $accessions

# Check overall exit status and log the result
if [ "$?" -eq 0 ]; then
    echo "All accessions processed successfully."
else
    echo "One or more accessions encountered errors."
fi
</code>
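
For reference, the accession list consumed by the script is just a plain text file with one SRA run accession per line; a minimal sketch (the run IDs below are placeholders):

<code>
SRR0000001
SRR0000002
SRR0000003
</code>

The daughter script can also be tested interactively on such a small list before involving Slurm, e.g. ''bash src/fq_download.sh data/fastq accessions.txt'' (the paths are illustrative).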

==== Parent script (for slurm) ====

To run the script with your list of files, create a txt file with one SRA accession per line. Then create a separate parent script as follows:

<code>
#!/bin/bash
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=30
#SBATCH --job-name=dwnld_fq
#SBATCH --output=log/dwnld_fq_%j.out

# Parameters
outdir="path/to/output_directory"
accession_file="path/to/accession_list.txt"
logfile="log/dwnld_fq_${SLURM_JOB_ID}.log"

# Start logging
echo "Job started on: $(date)" > "$logfile"
echo "Job ID: ${SLURM_JOB_ID}" >> "$logfile"
echo "Output directory: ${outdir}" >> "$logfile"
echo "Accession file: ${accession_file}" >> "$logfile"

# Call the daughter script and redirect both stdout and stderr to the same log
src/fq_download.sh "$outdir" "$accession_file" >> "$logfile" 2>&1

# Log end time
echo "Job finished on: $(date)" >> "$logfile"
</code>

Make sure the daughter script's path in the parent script matches where you actually saved it, and that the script is executable.
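
Submission is then a single ''sbatch'' call. Note that the ''log'' directory referenced by ''#SBATCH --output'' must already exist, because Slurm will not create it for you (the parent script name below is just a placeholder):

<code>
mkdir -p log
sbatch src/download_sra_projectX.sh
</code>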

==== Why is it advisable to use parent and daughter scripts? ====

It is good practice to log everything you do; it makes troubleshooting much easier later on. If you have a single script and pass the parameters on the fly, you will not be able to trace those parameters back from the log files. In other words, you will have no idea which input was used to produce a given log file.

With this setup, you create a separate parent script for each download attempt and get a log file with a matching name. You also do not have to copy-paste the whole code into each parent script: you just call the daughter script and that's all.

====== Download files from EGA ======

With this script you can download files from the EGA database using pyega3 and have them placed directly in the output directory.

You need to set a few inputs:

  * Credentials json file
  * Number of connections
  * List of file IDs to be downloaded
  * Output directory
  * Format of the files to be downloaded

Make sure you set ''cpus-per-task'' to the product of the number of files to be downloaded and the number of connections; for example, 10 files with 3 connections each calls for ''--cpus-per-task=30''.

Here is an example of a credentials json file:

<code>
{
    "username": "your.email@example.com",
    "password": "your_password"
}
</code>
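
Before scripting the whole batch, it can be useful to fetch a single file interactively to check that the credentials work; a minimal sketch, assuming ''pyega3'' is installed (e.g. via ''pip install pyega3''), the credentials above are saved as ''credentials.json'', and the file ID is a placeholder:

<code>
pyega3 -cf credentials.json -c 4 fetch EGAF00000000001
</code>

pyega3 places each download in a folder named after the file ID, which is why the script below moves the files to the output directory and removes those temporary folders afterwards.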

==== The script ====

<code>
#!/bin/bash
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=1
#SBATCH --job-name=dwnld_ega
#SBATCH --output=log/dwnld_ega_%j.out

# Set cpus-per-task to (number of files to download) x (connections per file)

# Set common variables
CREDENTIALS_FILE="path/to/credentials.json"
CONNECTIONS=1

# Define the path to the text file containing the file IDs (one ID per line)
FILE_ID_LIST="path/to/file_ids.txt"

# Define output directories
FILE_OUTPUT_DIR="path/to/output_dir"

# Define file format (used to find the downloaded files)
file_format="fastq.gz"

# --- Step 1: Create directories if they don't exist ---
echo "Creating output directories..."
mkdir -p "$FILE_OUTPUT_DIR" meta/md5sum log

# --- Step 2: Download files, move, and clean up temporary folders ---
echo "Downloading files from EGA..."
# Check if the file ID list exists
if [ ! -f "$FILE_ID_LIST" ]; then
    echo "Error: file ID list not found: $FILE_ID_LIST"
    exit 1
fi

# Start one background pyega3 download per file ID, then wait for all of them
while IFS= read -r file_id; do
    if [ -z "$file_id" ]; then
        continue
    fi
    echo "Starting download of ${file_id}..."
    pyega3 -c "$CONNECTIONS" -cf "$CREDENTIALS_FILE" fetch "$file_id" &
done < "$FILE_ID_LIST"
wait

# Move files to the final location and remove the temporary per-ID folders
echo "Moving downloaded files to ${FILE_OUTPUT_DIR}..."
while IFS= read -r file_id; do
    if [ -z "$file_id" ]; then
        continue
    fi
    mv "${file_id}"/*."${file_format}" "$FILE_OUTPUT_DIR"/
    rm -r "${file_id}"
done < "$FILE_ID_LIST"

# --- Step 3: Perform md5sum on all final files ---
echo "Calculating md5 checksums..."
# Loop through all files in the output directory
for file in "$FILE_OUTPUT_DIR"/*; do
    if [ -f "$file" ]; then
        filename=$(basename "$file")
        md5sum "$file" > "meta/md5sum/${filename}.md5"
        echo "Saved md5sum for ${filename}"
    fi
done

# --- Step 4: Remove unnecessary log files created by pyega3 ---
if [ -f pyega3_output.log ]; then
    rm pyega3_output.log
    echo "Removed pyega3_output.log"
fi

echo "All downloads completed."
</code>
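
As with the SRA scripts, make sure the ''log'' directory exists before submitting, and monitor the job as usual (the script name below is a placeholder):

<code>
mkdir -p log
sbatch src/ega_download.sh
squeue -u "$USER"
</code>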