Buckets:
| import wget | |
| import os | |
| import multiprocessing | |
| from functools import partial | |
| import time | |
# Directory that every downloaded archive is extracted into.
save_dir = "/workspace/seungheon/dataset"
os.makedirs(save_dir, exist_ok=True)

# Number of numbered ``.tar.gz`` shards published for each dataset name.
db_config = {"fma": 34, "mtg_jamendo": 134, "medleydb": 100, "moisesdb": 8, "musicnet": 21}

# One URL per shard, grouped by dataset in ``db_config`` order, with shard
# indices counting up from 0 within each dataset.
urls = []
for db_name, num_files in db_config.items():
    urls.extend(
        f"https://huggingface.co/datasets/seungheondoh/cmd-audio-dump/resolve/main/{db_name}{i}.tar.gz"
        for i in range(num_files)
    )
def download_and_unzip(url):
    """Download one ``.tar.gz`` archive and extract it into ``save_dir``.

    Args:
        url: Address of the gzip-compressed tar archive to fetch.

    The value returned by ``wget.download`` is used as the path of the
    downloaded archive, which is then unpacked under the module-level
    ``save_dir``. The downloaded archive itself is left in place.
    """
    # Imported locally because the file's import block does not include
    # ``tarfile``; without this the ``tarfile.open`` call raises NameError.
    import tarfile

    # Download file
    filename = wget.download(url)
    # Unzip file
    with tarfile.open(filename, 'r:gz') as tar:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive. Only use this with archives from a trusted source, or
        # consider an extraction filter on Python versions that support it.
        tar.extractall(path=save_dir)
if __name__ == "__main__":
    # Make sure the extraction target exists before any worker writes to it.
    os.makedirs(save_dir, exist_ok=True)

    # Wall-clock timer covering pool start-up, downloads and extraction.
    started_at = time.time()

    # One worker per CPU core, but never more workers than archives.
    worker_count = min(multiprocessing.cpu_count(), len(urls))
    with multiprocessing.Pool(processes=worker_count) as pool:
        pool.map(download_and_unzip, urls)

    # Report the elapsed time as whole minutes plus leftover whole seconds.
    elapsed = time.time() - started_at
    minutes, seconds = divmod(elapsed, 60)
    print(f"\nTotal download time: {int(minutes)} minutes and {int(seconds)} seconds")
Xet Storage Details
- Size:
- 1.15 kB
- Xet hash:
- c681d8f6e3fb69c91341bf8b6a6432e4dab2ea739572214a008a26590cfc2cb5
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.