Dataset: convert parquet files to nii.gz files
By: junqiangchen on June 10, 2025, 12:59 p.m.
Hi, I have used the snapshot_download tool to download all of the parquet files, but I don't know how to convert the parquet files to nii.gz files.
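For reference, the download step was done with something like the following minimal sketch (the repo id below is a placeholder, not the actual dataset repository):

```python
from huggingface_hub import snapshot_download

# Minimal sketch of the download step; the repo id is a placeholder --
# substitute the actual dataset repository on HuggingFace.
snapshot_download(
    repo_id="organization/odelia-dataset",  # placeholder
    repo_type="dataset",
    local_dir=r"E:\MedicalData\2025ODELIA\download",  # matches the input paths in the conversion script below
)
```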
By: junqiangchen on June 10, 2025, 1:59 p.m.
```python
from pathlib import Path

import numpy as np
import pandas as pd
import torchio as tio
from tqdm import tqdm


def deep_unpack(arr):
    # Recursively unpack object arrays until a plain numpy array is left
    if isinstance(arr, np.ndarray) and arr.dtype == object:
        if arr.size == 1:
            # Single element: unpack it recursively
            return deep_unpack(arr[0])
        else:
            # Multiple elements: unpack each one, then convert the list back to an array
            return np.array([deep_unpack(a) for a in arr])
    else:
        # Already a plain numpy array (or not an array at all): return as-is
        return arr


def convert_nii_gz(inputfiledir, outputdir):
    # ----------------------- Configuration -----------------------
    parquet_dir = Path(inputfiledir)  # directory containing the .parquet files
    output_root = Path(outputdir)     # directory where the .nii.gz files are written
    config = "unilateral"             # "default" or "unilateral"

    # Directory layout per configuration
    dir_config = {
        "default": {"data": "data", "metadata": "metadata"},
        "unilateral": {"data": "data_unilateral", "metadata": "metadata_unilateral"},
    }

    # Create the output root
    output_root.mkdir(parents=True, exist_ok=True)

    # ------------------ Process all .parquet files ------------------
    parquet_files = sorted(parquet_dir.glob("*.parquet"))
    all_metadata = []

    for file_path in tqdm(parquet_files, desc="Processing .parquet files"):
        print(f"\n📂 Processing: {file_path.name}")
        df = pd.read_parquet(file_path, engine="pyarrow")

        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Saving images"):
            uid = row["UID"]
            institution = row["Institution"]

            # Collect the image names from the Image_* columns
            img_names = [col.split("Image_")[1] for col in row.index if col.startswith("Image_")]

            # Output folder for this case
            path_folder = output_root / institution / dir_config[config]["data"] / uid
            path_folder.mkdir(parents=True, exist_ok=True)

            item_metadata = {
                "UID": uid,
                "Institution": institution,
                "Split": row.get("Split", "unknown"),
                "Fold": row.get("Fold", -1),
            }

            for img_name in img_names:
                img_key = f"Image_{img_name}"
                affine_key = f"Affine_{img_name}"
                img_data = row[img_key]
                img_affine = row[affine_key]

                if img_data is None or affine_key not in row:
                    continue

                img_np = deep_unpack(img_data)
                affine_np = deep_unpack(img_affine)

                print(f"🔍 Processing UID: {uid}, image: {img_name}, data type: {type(img_data)}")
                print(f"📏 Image shape: {img_np.shape}, dtype: {img_np.dtype}")
                print(f"📏 Affine shape: {affine_np.shape}, dtype: {affine_np.dtype}")
                print(f"🔬 Image element type: {type(img_np.flat[0])}")  # inspect one element

                if img_np.dtype == object:
                    print("⚠️ Image is an object-dtype array; its inner structure may be irregular")

                try:
                    img_np = np.array(img_np, dtype=np.int16)
                    affine_np = np.array(affine_np, dtype=np.float64)
                    if img_np.ndim == 3:
                        # TorchIO expects a 4D tensor (C, W, H, D)
                        img_np = np.expand_dims(img_np, axis=0)
                        img_tensor = tio.ScalarImage(tensor=img_np, affine=affine_np)
                        img_tensor.save(path_folder / f"{img_name}.nii.gz")
                    else:
                        print(f"❗ Skipping UID: {uid}, image {img_name}, unexpected dimensions: {img_np.shape}")
                except Exception as e:
                    print(f"❌ Image conversion failed for UID: {uid}, error: {e}")

            all_metadata.append(item_metadata)

    # ------------------ Save the metadata ------------------
    df_all = pd.DataFrame(all_metadata)
    for institution in df_all["Institution"].unique():
        df_inst = df_all[df_all["Institution"] == institution]
        path_metadata = output_root / institution / dir_config[config]["metadata"]
        path_metadata.mkdir(parents=True, exist_ok=True)
        df_anno = df_inst.drop(columns=["Institution", "Split", "Fold"])
        df_anno.to_csv(path_metadata / "annotation.csv", index=False)
        df_split = df_inst[["UID", "Split", "Fold"]]
        df_split.to_csv(path_metadata / "split.csv", index=False)

    print("✅ All images and metadata saved!")


if __name__ == "__main__":
    convert_nii_gz(r"E:\MedicalData\2025ODELIA\download\unilateral", r"E:\MedicalData\2025ODELIA\dataset/unilateral")
    convert_nii_gz(r"E:\MedicalData\2025ODELIA\download\data", r"E:\MedicalData\2025ODELIA\dataset/data")
```
By: NickPayne on June 11, 2025, 9:46 a.m.
Hi - I'm glad you've been able to download the dataset.
The code provided on HuggingFace should handle the conversion you need. I see you've adapted the script a little; I suggest you go through your changes carefully and track down what's causing the error.
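If it helps to narrow things down, you could inspect a single row before converting anything. This is only a rough sketch and assumes the UID/Image_*/Affine_* column layout shown in your script:

```python
from pathlib import Path

import numpy as np
import pandas as pd

# Rough debugging sketch -- assumes the Image_*/Affine_* column layout from the script above.
parquet_path = next(Path(r"E:\MedicalData\2025ODELIA\download\unilateral").glob("*.parquet"))
df = pd.read_parquet(parquet_path, engine="pyarrow")

print(df.columns.tolist())  # confirm the expected UID / Image_* / Affine_* columns exist

row = df.iloc[0]
for col in row.index:
    if not col.startswith("Image_"):
        continue
    img = np.asarray(row[col])
    print(col, img.dtype, img.shape)  # object dtype or odd shapes point at the unpacking step
```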