# default_exp datasets.download
Download datasets
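Utilities to download image-classification datasets and to generate annotation files for them. Each `get_*` function fetches an archive from the fast.ai S3 bucket, verifies its SHA-256 hash, extracts it unless the target directory already exists, and writes `annotations_train.json` / `annotations_test.json` files mapping each image path to a one-element list of labels. The private helpers `_download_url` and `_extract_tar` are assumed to be defined alongside this module.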
#export
from pathlib import Path
from typing import Any, Dict, List
import glob
import json

#export
def get_cifar10(output_dir: Path):
"""
    Download the CIFAR-10 dataset.
"""
output_dir = Path(output_dir)
dataset_dir = output_dir / 'cifar10'
_download_url(url='https://s3.amazonaws.com/fast-ai-imageclas/',
root=output_dir, filename="cifar10.tgz",
file_hash=("sha256:637c5814e11aefcb6ee76d5f"
"59c67ddc8de7f5b5077502a195b0833d1e3e4441"))
if not dataset_dir.is_dir():
_extract_tar(output_dir / 'cifar10.tgz', output_dir)
else:
        print(f'Directory {dataset_dir} already exists, skipping extraction.')
    print('Generating train/test data...')
imdir_train = dataset_dir / 'train'
imdir_test = dataset_dir / 'test'
    # collect image paths from the extracted train/test folders
    train = [Path(p) for p in glob.glob(f'{imdir_train}/*/*')]
    test = [Path(p) for p in glob.glob(f'{imdir_test}/*/*')]
    # build the annotations.json data: each image path maps to a one-element
    # label list derived from its parent folder name,
    # e.g. {'.../train/airplane/0001.png': ['airplane.jpg']}
    annotations_train = {str(p): [f'{p.parts[-2]}.jpg'] for p in train}
    annotations_test = {str(p): [f'{p.parts[-2]}.jpg'] for p in test}
train_path = dataset_dir / 'annotations_train.json'
test_path = dataset_dir / 'annotations_test.json'
with open(train_path, 'w') as f:
json.dump(annotations_train, f)
with open(test_path, 'w') as f:
json.dump(annotations_test, f)
print("Done")
return train_path, test_path
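A quick, non-exported usage sketch: the call below assumes a hypothetical `data/` output directory and reads back the generated training annotations.

train_json, test_json = get_cifar10('data')
with open(train_json) as f:
    annotations = json.load(f)
# each entry maps an image path to a one-element label list,
# e.g. 'data/cifar10/train/airplane/0001.png': ['airplane.jpg']
print(len(annotations), 'training images')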
#export
def get_oxford_102_flowers(output_dir: Path):
"""
    Download the Oxford 102 Flowers dataset.
"""
output_dir = Path(output_dir)
dataset_dir = output_dir / 'oxford-102-flowers'
_download_url(url='https://s3.amazonaws.com/fast-ai-imageclas/',
root=output_dir, filename="oxford-102-flowers.tgz",
file_hash=("sha256:680a253086535b2c800aada76a45fc1"
"89d3dcae4da6da8db36ce6a95f00bf4ad"))
if not dataset_dir.is_dir():
_extract_tar(output_dir / 'oxford-102-flowers.tgz', output_dir)
else:
        print(f'Directory {dataset_dir} already exists, skipping extraction.')
    print('Generating train/test data...')
with open(dataset_dir / 'train.txt', 'r') as f:
# https://github.com/python/mypy/issues/7558
_annotations_train: Dict[str, Any] = dict(tuple(
line.split()) for line in f) # type: ignore
annotations_train: Dict[str, List[str]] = {
str(dataset_dir / k): [v + '.jpg'] for k, v in _annotations_train.items()
}
with open(dataset_dir / 'test.txt', 'r') as f:
_annotations_test: Dict[str, Any] = dict(tuple(
line.split()) for line in f) # type: ignore
annotations_test: Dict[str, List[str]] = {
str(dataset_dir / k): [v + '.jpg'] for k, v in _annotations_test.items()
}
train_path = dataset_dir / 'annotations_train.json'
test_path = dataset_dir / 'annotations_test.json'
with open(train_path, 'w') as f:
json.dump(annotations_train, f)
with open(test_path, 'w') as f:
json.dump(annotations_test, f)
print("Done")
return train_path, test_path
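For reference, `train.txt` and `test.txt` are two-column, space-separated files (relative image path, class id). A small sketch of the parsing step above, using fabricated lines; real keys are built with `dataset_dir / k`, so the prefix here is illustrative only.

sample = ['jpg/image_03860.jpg 16', 'jpg/image_06773.jpg 93']  # fabricated
parsed = dict(tuple(line.split()) for line in sample)
annos = {f'oxford-102-flowers/{k}': [v + '.jpg'] for k, v in parsed.items()}
print(annos)
# {'oxford-102-flowers/jpg/image_03860.jpg': ['16.jpg'],
#  'oxford-102-flowers/jpg/image_06773.jpg': ['93.jpg']}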
#export
def get_cub_200_2011(output_dir: Path):
"""
    Download the CUB-200-2011 dataset.
"""
output_dir = Path(output_dir)
dataset_dir = output_dir / 'CUB_200_2011'
_download_url(url='https://s3.amazonaws.com/fast-ai-imageclas/',
root=output_dir, filename='CUB_200_2011.tgz',
file_hash=("sha256:0c685df5597a8b24909f6a7c9db6"
"d11e008733779a671760afef78feb49bf081"))
if not dataset_dir.is_dir():
_extract_tar(output_dir / 'CUB_200_2011.tgz', output_dir)
else:
        print(f'Directory {dataset_dir} already exists, skipping extraction.')
    print('Generating train/test data...')
with open(dataset_dir / 'images.txt', 'r') as f:
image_id_map: Dict[str, Any] = dict(tuple(
line.split()) for line in f) # type: ignore
with open(dataset_dir / 'classes.txt', 'r') as f:
class_id_map: Dict[str, Any] = dict(tuple(
line.split()) for line in f) # type: ignore
with open(dataset_dir / 'train_test_split.txt', 'r') as f:
splitter: Dict[str, Any] = dict(tuple(
line.split()) for line in f) # type: ignore
    # image ids for train/test; in train_test_split.txt the second column is
    # <is_training_image>: 1 marks a training image, 0 a test image.
    # Sets make the membership checks below O(1).
    train_k = {k for k, v in splitter.items() if v == '1'}
    test_k = {k for k, v in splitter.items() if v == '0'}
with open(dataset_dir / 'image_class_labels.txt', 'r') as f:
anno_: Dict[str, Any] = dict(tuple(
line.split()) for line in f) # type: ignore
annotations_train = {
str(dataset_dir / 'images' / image_id_map[k]):
[class_id_map[v] + '.jpg'] for k, v in anno_.items() if k in train_k}
annotations_test = {
str(dataset_dir / 'images' / image_id_map[k]):
[class_id_map[v] + '.jpg'] for k, v in anno_.items() if k in test_k}
train_path = dataset_dir / 'annotations_train.json'
test_path = dataset_dir / 'annotations_test.json'
with open(train_path, 'w') as f:
json.dump(annotations_train, f)
with open(test_path, 'w') as f:
json.dump(annotations_test, f)
print("Done")
return train_path, test_path
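Finally, a minimal end-to-end sketch tying the three getters together; the `data/` output directory is an assumption for illustration, not part of the exported module.

output = Path('data')
output.mkdir(parents=True, exist_ok=True)  # helpers may expect the root to exist
for getter in (get_cifar10, get_oxford_102_flowers, get_cub_200_2011):
    train_path, test_path = getter(output)
    print(f'{getter.__name__}: {train_path}, {test_path}')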