====== Download FASTQ files from SRA ======
The easiest way to download FASTQ files from SRA is with the ''fastq-dump'' command from the SRA Toolkit:

''fastq-dump --gzip --split-3 <SRA accession> -O <output directory>''

Options:

  * ''--gzip'' - compress the output FASTQ files with gzip
  * ''--split-3'' - write paired-end reads to separate ''_1'' and ''_2'' files (any unpaired reads go into a third file)
  * ''-O'' - directory where the FASTQ files are written
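
For example, a single run could be downloaded like this (the accession and the output directory are placeholders):

<code bash>
# Download one run, compress the output and split paired-end reads
fastq-dump --gzip --split-3 SRR000001 -O data/fastq
# Produces data/fastq/SRR000001_1.fastq.gz and data/fastq/SRR000001_2.fastq.gz
# (plus data/fastq/SRR000001.fastq.gz if any unpaired reads exist)
</code>

To download many accessions at once, in parallel and with automatic retries, you can use the following script: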
<code bash>
#!/bin/bash
# SCRIPT FOR DOWNLOADING SRA FILES USING THE NCBI SRA TOOLKIT
# NOTE: Run this script from the directory where the "src" folder is located.
#
# PURPOSE:
#   This script reads SRA accession IDs from a given file (one per line)
#   and downloads each corresponding SRA file using fastq-dump.
#
# PARAMETERS:
#   $1 - output directory for the downloaded FASTQ files
#   $2 - file containing the SRA accession IDs (one per line)
#
# SAMPLE USAGE:
#   src/download_fastq.sh <output directory> <accession file>
#
# IMPORTANT:
#   - This script downloads files using fastq-dump, gzips and splits paired-end reads
#     (it does nothing to single-end reads).
#   - Ensure that the SRA Toolkit and GNU parallel are installed and available.

# Read the input parameters
outdir="$1"
accession_file="$2"

# Create the output directory if it doesn't exist
mkdir -p "$outdir"

# Function to download an SRA accession using fastq-dump
download_sra() {
    local acc="$1"
    echo "Downloading $acc ..."
    if fastq-dump --gzip --split-3 "$acc" -O "$outdir"; then
        echo "$acc downloaded successfully."
        return 0
    else
        echo "ERROR: download of $acc failed."
        return 1
    fi
}

# Function to retry the download of an accession up to max_retries times
download_with_retry() {
    local acc="$1"
    local max_retries=3   # maximum number of attempts per accession (3 is an assumed default)
    local attempt=1
    while [ $attempt -le $max_retries ]; do
        echo "Attempt $attempt of $max_retries for $acc"
        if download_sra "$acc"; then
            return 0  # Success
        fi
        ((attempt++))
    done
    echo "ERROR: $acc failed after $max_retries attempts."
    return 1
}

# Export the function and variables so the parallel jobs can use them
export -f download_sra download_with_retry
export outdir

# Process all accessions in parallel
accessions=$(grep -v '^[[:space:]]*$' "$accession_file")   # skip empty lines
if [ -z "$accessions" ]; then
    echo "ERROR: no accessions found in $accession_file"
    exit 1
fi
echo "Starting downloads with up to 20 parallel jobs..."
parallel -j 20 download_with_retry ::: $accessions
| - | |||
| - | # Append an extra newline to the failed downloads file | ||
| - | echo "" | ||
| # Check overall exit status and log the result | # Check overall exit status and log the result | ||
| if [ " | if [ " | ||
| - | echo "All accessions processed successfully." >> " | + | echo "All accessions processed successfully." |
| else | else | ||
| - | echo "One or more accessions encountered errors." >> " | + | echo "One or more accessions encountered errors." |
| fi | fi | ||
| - | |||
| - | echo "Job completed on: $(date)" | ||
</code>
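
For instance, assuming a hypothetical accession file, a run could look like this (all paths are illustrative):

<code bash>
# meta/sra_accessions.txt contains one run ID per line, e.g.:
#   SRR000001
#   SRR000002

src/download_fastq.sh data/fastq meta/sra_accessions.txt
</code>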

Instead of running the daughter script directly, you can call it from a parent SLURM script:

<code bash>
#!/bin/bash
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=30
#SBATCH --job-name=dwnld_fq
#SBATCH --output=log/dwnld_fq.out

# Parameters (paths are illustrative - adjust to your project)
outdir="data/fastq"
accession_file="meta/sra_accessions.txt"
logfile="log/dwnld_fq.log"

# Start logging
echo "Job ID: $SLURM_JOB_ID" > "$logfile"
echo "Job started on: $(date)" >> "$logfile"
echo "Output directory: $outdir" >> "$logfile"
echo "Accession file: $accession_file" >> "$logfile"

# Call the daughter script (name is illustrative) and redirect both stdout and stderr to the same log
src/download_fastq.sh "$outdir" "$accession_file" >> "$logfile" 2>&1

# Log end time
echo "Job completed on: $(date)" >> "$logfile"
</code>
This way, you can have a separate parent script for each download attempt, each with a log file of the same name, and you won't have to copy-paste the whole code into every parent script: just call the daughter script and that's all!
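
For example, two download batches could then be submitted through two thin parent scripts (the script names here are made up):

<code bash>
# Each parent script sets its own outdir, accession_file and logfile,
# then calls the same daughter script
sbatch src/dwnld_fq_batch1.sh
sbatch src/dwnld_fq_batch2.sh
</code>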
| + | |||

====== Download files from EGA ======

With this script you can download files from the EGA database using pyega3 and have them placed directly in the output directory.

You need to set some inputs:

  * Credentials JSON file
  * Number of parallel connections
  * List of file IDs to be downloaded
  * Output directory
  * Format of the files to be downloaded

Set the number of cpus-per-task to the number of files to be downloaded multiplied by the number of connections (for example, 5 files with 4 connections each need 20 CPUs).

Here is an example of a credentials JSON file:

<code>
{
    "username": "your.email@example.com",
    "password": "your_password"
}
</code>
| + | |||
| + | ==The script== | ||
| + | |||
<code bash>
#!/bin/bash
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=1
#SBATCH --job-name=dwnld_ega
#SBATCH --output=log/dwnld_ega.out

# Set cpus-per-task to (number of files) x (number of connections)
# Set common variables (paths are illustrative - adjust to your project)
CREDENTIALS_FILE="meta/credentials.json"
CONNECTIONS=1

# Define the path to the text file containing the file IDs
FILE_ID_LIST="meta/ega_file_ids.txt"

# Define output directories
FILE_OUTPUT_DIR="data/ega"

# Define file format
file_format="fastq.gz"

# --- Step 1: Create directories if they don't exist ---
echo "Creating directories..."
mkdir -p "$FILE_OUTPUT_DIR" meta/md5sum log

# --- Step 2: Download files, move, and clean up temporary folders ---
echo "Downloading files..."
# Check that the file ID list exists
if [ ! -f "$FILE_ID_LIST" ]; then
    echo "ERROR: file ID list not found: $FILE_ID_LIST"
    exit 1
fi

while IFS= read -r file_id; do
    if [ -z "$file_id" ]; then
        continue
    fi
    echo "Downloading $file_id ..."
    pyega3 -c "$CONNECTIONS" -cf "$CREDENTIALS_FILE" fetch "$file_id" --output-dir "$FILE_OUTPUT_DIR" &
done < "$FILE_ID_LIST"
wait

# Move files to the final location and remove temporary folders
# (pyega3 saves each file into a subfolder named after its file ID)
echo "Moving files to $FILE_OUTPUT_DIR ..."
while IFS= read -r file_id; do
    if [ -z "$file_id" ]; then
        continue
    fi
    mv "$FILE_OUTPUT_DIR/$file_id"/*."$file_format" "$FILE_OUTPUT_DIR/"
    rm -r "$FILE_OUTPUT_DIR/$file_id"
done < "$FILE_ID_LIST"

# --- Step 3: Perform md5sum on all final files ---
echo "Computing md5sums..."
# Loop through all downloaded files
for file in "$FILE_OUTPUT_DIR"/*."$file_format"; do
    if [ -f "$file" ]; then
        filename=$(basename "$file")
        md5sum "$file" > "meta/md5sum/${filename}.md5"
        echo "md5sum written for $filename"
    fi
done

# --- Step 4: Remove unnecessary log files created by pyega3 ---
if [ -f pyega3_output.log ]; then
    rm pyega3_output.log
    echo "Removed pyega3_output.log"
fi

echo "All done."
</code>
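
As a sketch of a complete run (the IDs and paths are made up): the file ID list holds one EGA file accession per line, and the job is submitted with sbatch:

<code bash>
# meta/ega_file_ids.txt (hypothetical content):
#   EGAF00000000001
#   EGAF00000000002

sbatch src/download_ega.sh
</code>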
| + | |||
scripts/download_fastq.1742973802.txt.gz · Last modified: by 37.26.174.181
