From 57acfd645fade756a8e1b7005efadaf6c268acce Mon Sep 17 00:00:00 2001
From: gouhanke <12219217+gouhanke@user.noreply.gitee.com>
Date: Tue, 3 Feb 2026 14:18:30 +0800
Subject: [PATCH] feat(vla): initialize the VLA framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.en.md                               |  36 -----
 README.md                                  | 150 +++++++++++++++++----
 roboimi/demos/vla_scripts/train_vla.py     |  45 ++++++
 roboimi/vla/__init__.py                    |   1 +
 roboimi/vla/agent.py                       |  73 +++++++++
 roboimi/vla/conf/agent/default.yaml        |  30 ++++
 roboimi/vla/conf/agent/tiny.yaml           |   1 +
 roboimi/vla/conf/backbone/clip.yaml        |   1 +
 roboimi/vla/conf/backbone/siglip.yaml      |   4 +
 roboimi/vla/conf/config.yaml               |  12 ++
 roboimi/vla/conf/data/default_dataset.yaml |  16 +++
 roboimi/vla/conf/head/act.yaml             |   1 +
 roboimi/vla/conf/head/diffusion.yaml       |   8 ++
 roboimi/vla/conf/projector/mlp.yaml        |   6 +
 roboimi/vla/conf/projector/perceiver.yaml  |   0
 roboimi/vla/conf/train/debug.yaml          |   1 +
 roboimi/vla/conf/train/gpu.yaml            |   1 +
 roboimi/vla/core/__init__.py               |   0
 roboimi/vla/core/base_policy.py            |   1 +
 roboimi/vla/core/base_vlm.py               |   1 +
 roboimi/vla/data/__init__.py               |   0
 roboimi/vla/data/dataset.py                |  88 ++++++++++
 roboimi/vla/data/image_transforms.py       |   1 +
 roboimi/vla/data/text_processing.py        |   1 +
 roboimi/vla/models/backbones/__init__.py   |   6 +
 roboimi/vla/models/backbones/clip.py       |   1 +
 roboimi/vla/models/backbones/dinov2.py     |   1 +
 roboimi/vla/models/backbones/siglip.py     |   1 +
 roboimi/vla/models/heads/__init__.py       |   5 +
 roboimi/vla/models/heads/act.py            |   1 +
 roboimi/vla/models/heads/diffusion.py      |   1 +
 roboimi/vla/models/projectors/__init__.py  |   5 +
 roboimi/vla/models/projectors/mlp.py       |   1 +
 roboimi/vla/models/projectors/perceiver.py |   1 +
 roboimi/vla/modules/__init__.py            |   0
 roboimi/vla/modules/encoders.py            |   1 +
 roboimi/vla/modules/fusion.py              |   1 +
 roboimi/vla/scripts/convert_to_hdf5.py     |   1 +
 roboimi/vla/scripts/download_weights.py    |   1 +
 roboimi/vla/scripts/visualize_data.py      |   1 +
 40 files changed, 443 insertions(+), 63 deletions(-)
 delete mode 100644 README.en.md
 create mode 100644 roboimi/demos/vla_scripts/train_vla.py
 create mode 100644 roboimi/vla/__init__.py
 create mode 100644 roboimi/vla/agent.py
 create mode 100644 roboimi/vla/conf/agent/default.yaml
 create mode 100644 roboimi/vla/conf/agent/tiny.yaml
 create mode 100644 roboimi/vla/conf/backbone/clip.yaml
 create mode 100644 roboimi/vla/conf/backbone/siglip.yaml
 create mode 100644 roboimi/vla/conf/config.yaml
 create mode 100644 roboimi/vla/conf/data/default_dataset.yaml
 create mode 100644 roboimi/vla/conf/head/act.yaml
 create mode 100644 roboimi/vla/conf/head/diffusion.yaml
 create mode 100644 roboimi/vla/conf/projector/mlp.yaml
 create mode 100644 roboimi/vla/conf/projector/perceiver.yaml
 create mode 100644 roboimi/vla/conf/train/debug.yaml
 create mode 100644 roboimi/vla/conf/train/gpu.yaml
 create mode 100644 roboimi/vla/core/__init__.py
 create mode 100644 roboimi/vla/core/base_policy.py
 create mode 100644 roboimi/vla/core/base_vlm.py
 create mode 100644 roboimi/vla/data/__init__.py
 create mode 100644 roboimi/vla/data/dataset.py
 create mode 100644 roboimi/vla/data/image_transforms.py
 create mode 100644 roboimi/vla/data/text_processing.py
 create mode 100644 roboimi/vla/models/backbones/__init__.py
 create mode 100644 roboimi/vla/models/backbones/clip.py
 create mode 100644 roboimi/vla/models/backbones/dinov2.py
 create mode 100644 roboimi/vla/models/backbones/siglip.py
 create mode 100644 roboimi/vla/models/heads/__init__.py
 create mode 100644 roboimi/vla/models/heads/act.py
 create mode 100644 roboimi/vla/models/heads/diffusion.py
 create mode 100644 roboimi/vla/models/projectors/__init__.py
 create mode 100644 roboimi/vla/models/projectors/mlp.py
 create mode 100644 roboimi/vla/models/projectors/perceiver.py
 create mode 100644 roboimi/vla/modules/__init__.py
 create mode 100644 roboimi/vla/modules/encoders.py
 create mode 100644 roboimi/vla/modules/fusion.py
 create mode 100644 roboimi/vla/scripts/convert_to_hdf5.py
 create mode 100644 roboimi/vla/scripts/download_weights.py
 create mode 100644 roboimi/vla/scripts/visualize_data.py

diff --git a/README.en.md b/README.en.md
deleted file mode 100644
index 024238a..0000000
--- a/README.en.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# robo-imi-act
-
-#### Description
-{**When you're done, you can delete the content in this README and update the file with details for others getting started with your repository**}
-
-#### Software Architecture
-Software architecture description
-
-#### Installation
-
-1. xxxx
-2. xxxx
-3. xxxx
-
-#### Instructions
-
-1. xxxx
-2. xxxx
-3. xxxx
-
-#### Contribution
-
-1. Fork the repository
-2. Create Feat_xxx branch
-3. Commit your code
-4. Create Pull Request
-
-
-#### Gitee Feature
-
-1. You can use Readme\_XXX.md to support different languages, such as Readme\_en.md, Readme\_zh.md
-2. Gitee blog [blog.gitee.com](https://blog.gitee.com)
-3. Explore open source project [https://gitee.com/explore](https://gitee.com/explore)
-4. The most valuable open source project [GVP](https://gitee.com/gvp)
-5. The manual of Gitee [https://gitee.com/help](https://gitee.com/help)
-6. The most popular members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)
diff --git a/README.md b/README.md
index b72b112..67cf43d 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,135 @@
-# robo-imi-act
+# VLA Framework: Vision-Language-Action Policy Framework
-
-#### Introduction
-{**The following is the Gitee platform blurb; you can replace this introduction.**
-Gitee is a Git-based code hosting platform launched by OSCHINA (SVN is also supported), providing developers with a stable, efficient, and secure cloud platform for collaborative software development.
-Individuals, teams, and enterprises can all use Gitee for code hosting, project management, and collaborative development. For enterprise projects, see [https://gitee.com/enterprises](https://gitee.com/enterprises)}
+**VLA Framework** is the next-generation embodied-intelligence policy framework in the `roboimi` ecosystem. Its architecture is **fully decoupled** and **composition-based**, so vision-language models (VLMs), projectors, and action heads can be combined freely.
-
-#### Software Architecture
-Software architecture description
+The framework uses [Hydra](https://hydra.cc/) for configuration management and HDF5 as its standard data format.
+
+---
-
-#### Installation
+## 🏗 Architecture Overview (Directory Structure)
-
-1. xxxx
-2. xxxx
-3. xxxx
+We follow two design principles: "separate interfaces from implementations" and "mirror code with configuration".
-
-#### Instructions
+```text
+roboimi/vla/
+├── agent.py            # [Core] VLAAgent assembly class; wires the modules together
+├── conf/               # [Config] Hydra config files (single source of truth)
+│   ├── config.yaml     # Main entry config
+│   ├── agent/          # Agent structure (module wiring and interpolation)
+│   ├── backbone/       # Vision backbone configs (e.g., SigLIP, CLIP)
+│   ├── projector/      # Projector configs (e.g., MLP, Perceiver)
+│   ├── head/           # Action head configs (e.g., Diffusion, ACT)
+│   └── data/           # Data pipeline configs
+├── core/               # [Interface] Abstract base classes
+│   ├── base_vlm.py     # VLMBackbone (ABC)
+│   └── base_policy.py  # ActionHead (ABC)
+├── models/             # [Implementation] Concrete model implementations
+│   ├── backbones/      # Vision models (sub-package)
+│   ├── projectors/     # Projectors (sub-package)
+│   └── heads/          # Policy heads (sub-package)
+├── data/               # [Data Pipeline] Dataset and DataLoader
+├── modules/            # [Building Blocks] Shared components (encoders, fusion)
+└── scripts/            # [Utilities] Data conversion and maintenance scripts
+```
-
-1. xxxx
-2. xxxx
-3. xxxx
+
+---
-
-#### Contribution
+## 🚀 Quick Start
-
-1. Fork this repository
-2. Create a Feat_xxx branch
-3. Commit your code
-4. Create a Pull Request
+### 1. Dependencies
+Make sure the core libraries are installed:
+```bash
+pip install hydra-core h5py zarr diffusers transformers
+```
+### 2. Training
+The training entry point is `roboimi/demos/vla_scripts/train_vla.py`.
+Because the framework is Hydra-based, you can compose the model architecture on the command line:
-
-#### Gitee Features
+```bash
+# 1. Default training (SigLIP + MLP + Diffusion)
+python roboimi/demos/vla_scripts/train_vla.py
-
-1. Use Readme\_XXX.md to support multiple languages, e.g., Readme\_en.md, Readme\_zh.md
-2. The official Gitee blog [blog.gitee.com](https://blog.gitee.com)
-3. Explore outstanding open source projects on Gitee at [https://gitee.com/explore](https://gitee.com/explore)
-4. [GVP](https://gitee.com/gvp), Gitee's Most Valuable open source Projects, selected by comprehensive review
-5. The official Gitee user manual [https://gitee.com/help](https://gitee.com/help)
-6. Gitee Cover People, a column showcasing Gitee members [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)
+
+# 2. Swap the vision backbone to CLIP
+python roboimi/demos/vla_scripts/train_vla.py agent/backbone=clip
+
+# 3. Swap the projector to the Perceiver Resampler
+python roboimi/demos/vla_scripts/train_vla.py agent/projector=perceiver
+
+# 4. Override hyperparameters (e.g., batch size; see the note below)
+python roboimi/demos/vla_scripts/train_vla.py train.batch_size=32
+
+# 5. Debug mode (tiny model for a quick end-to-end run)
+python roboimi/demos/vla_scripts/train_vla.py agent=tiny
+```
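+
+Note: the `conf/train/*.yaml` files ship as placeholder stubs (a single comment each) in this initial commit. For the `train.batch_size` override above, and the DataLoader setup in `train_vla.py`, to resolve, `conf/train/gpu.yaml` needs at least a `batch_size` entry. A minimal sketch, with illustrative values only:
+
+```yaml
+# conf/train/gpu.yaml (sketch -- this file is currently a placeholder)
+batch_size: 64   # read by train_vla.py as cfg.train.batch_size
+num_epochs: 100  # illustrative
+lr: 1.0e-4       # illustrative
+```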
+
+---
+
+## 🛠 Developer Guide
+
+### 1. Adding a New Vision Backbone
+1. **Code**: create a new file under `models/backbones/` (e.g., `my_model.py`) and subclass `VLMBackbone`.
+2. **Export**: add the new class to `models/backbones/__init__.py`.
+3. **Config**: create `my_model.yaml` under `conf/backbone/`.
+   * *Note*: you must define `output_dim` so the projector can reference it. A sketch follows this list.
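+
+For reference, a minimal backbone might look like the sketch below. It is illustrative only: `VLMBackbone` is still a stub in `core/base_vlm.py`, so the assumed contract is simply "an `nn.Module` whose forward returns a dict with an `image_embeds` entry", which is what `VLAAgent.forward` consumes.
+
+```python
+# roboimi/vla/models/backbones/my_model.py (illustrative sketch)
+import torch
+import torch.nn as nn
+
+class MyBackbone(nn.Module):  # would subclass VLMBackbone once the ABC is filled in
+    def __init__(self, output_dim: int = 768, frozen: bool = True):
+        super().__init__()
+        self.output_dim = output_dim
+        # Placeholder encoder; a real backbone would wrap a pretrained ViT here.
+        self.encoder = nn.Sequential(
+            nn.Conv2d(3, 64, kernel_size=8, stride=8),
+            nn.AdaptiveAvgPool2d(1),
+            nn.Flatten(),
+            nn.Linear(64, output_dim),
+        )
+        if frozen:
+            for p in self.encoder.parameters():
+                p.requires_grad = False
+
+    def forward(self, images: torch.Tensor) -> dict:
+        # VLAAgent expects a dict with 'image_embeds' of shape [B, output_dim].
+        return {"image_embeds": self.encoder(images)}
+```
+
+The matching `conf/backbone/my_model.yaml` would declare `_target_: roboimi.vla.models.backbones.MyBackbone` together with `output_dim: 768`.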
+
+### 2. Adding a New Projector
+The projector aligns the VLM feature dimension with the agent's embedding dimension.
+1. **Code**: implement an `nn.Module` under `models/projectors/`.
+2. **Config**: add a new YAML file under `conf/projector/`.
+   * *Key point*: set `input_dim: ???` and `output_dim: ???` so that Hydra fills them in automatically via interpolation in `agent/default.yaml`.
+
+### 3. Adding a New Action Head
+1. **Code**: create a new file under `models/heads/` and subclass `ActionHead`.
+   * It must implement `compute_loss(context, actions)` and `predict_action(context)`.
+2. **Config**: add a new YAML file under `conf/head/`.
+   * Likewise, set `input_dim: ???` to keep the dimension dynamic. A combined projector/head sketch follows this section.
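+
+The sketch below shows what implementations satisfying these two contracts might look like. It is an assumption-laden illustration: `MLPProjector` mirrors the fields in `conf/projector/mlp.yaml`, while `MyActionHead` is a plain regression head that only demonstrates the `compute_loss`/`predict_action` interface (the real `DiffusionActionHead` would wrap a denoising model instead).
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class MLPProjector(nn.Module):
+    """Sketch matching conf/projector/mlp.yaml; input_dim/output_dim are filled by Hydra."""
+    def __init__(self, input_dim: int, output_dim: int,
+                 hidden_dim: int = 1024, dropout: float = 0.1):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(input_dim, hidden_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, output_dim),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.net(x)
+
+class MyActionHead(nn.Module):  # would subclass ActionHead once the ABC is filled in
+    """Minimal head demonstrating the required interface."""
+    def __init__(self, input_dim: int, action_dim: int = 7, pred_horizon: int = 16):
+        super().__init__()
+        self.pred_horizon = pred_horizon
+        self.action_dim = action_dim
+        self.decoder = nn.Linear(input_dim, pred_horizon * action_dim)
+
+    def _decode(self, context: torch.Tensor) -> torch.Tensor:
+        # context: [B, T_ctx, input_dim] -> pool over tokens, decode an action chunk
+        pooled = context.mean(dim=1)
+        return self.decoder(pooled).view(-1, self.pred_horizon, self.action_dim)
+
+    def compute_loss(self, context: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
+        return F.mse_loss(self._decode(context), actions)
+
+    @torch.no_grad()
+    def predict_action(self, context: torch.Tensor) -> torch.Tensor:
+        return self._decode(context)
+```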
+
+---
+
+## 📊 Data Pipeline
+
+This framework standardizes on **HDF5** to optimize IO performance.
+
+### 1. Data Layout Standard
+Datasets must follow the [Robomimic](https://robomimic.github.io/) hierarchy shown below; a small writer sketch follows the tree.
+```text
+dataset.hdf5
+├── data/
+│   ├── demo_0/
+│   │   ├── obs/
+│   │   │   ├── agentview_rgb   # (T, H, W, 3) uint8
+│   │   │   └── qpos            # (T, D) float32
+│   │   ├── actions             # (T, D) float32
+│   │   └── language            # (attribute) string instruction
+│   └── ...
+```
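+
+To make the layout concrete, here is a small `h5py` sketch that writes one conforming (synthetic) demo. The field names follow the tree above; storing the instruction as a `language` attribute on the demo group is this framework's convention:
+
+```python
+import h5py
+import numpy as np
+
+T, H, W, D = 50, 224, 224, 7  # episode length, image size, action dim (illustrative)
+
+with h5py.File("dataset.hdf5", "w") as f:
+    demo = f.create_group("data/demo_0")  # intermediate groups are created automatically
+    obs = demo.create_group("obs")
+    obs.create_dataset("agentview_rgb", data=np.zeros((T, H, W, 3), dtype=np.uint8))
+    obs.create_dataset("qpos", data=np.zeros((T, D), dtype=np.float32))
+    demo.create_dataset("actions", data=np.zeros((T, D), dtype=np.float32))
+    demo.attrs["language"] = "pick up the red cube"  # string instruction
+```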
+
+### 2. Data Conversion Tool
+Use the built-in script to convert your raw data into the standard HDF5 layout:
+
+```bash
+# Run from the project root
+python -m roboimi.vla.scripts.convert_to_hdf5 \
+    --input_dir /path/to/raw/images \
+    --output_path ./data/demo.hdf5
+```
+
+### 3. Inspecting Data
+If you are unsure whether your data is correct, check it with the visualization tool:
+```bash
+python -m roboimi.vla.scripts.visualize_data --dataset ./data/demo.hdf5
+```
+
+---
+
+## ⚠️ Best Practices
+
+1. **Absolute imports**: do not use `from . import xxx`. Always use the full path, e.g., `from roboimi.vla.models.backbones import SigLIPBackbone`.
+2. **Hydra interpolation**: `agent/default.yaml` uses the `${..embed_dim}` syntax to keep all sub-module dimensions consistent. **Do not hard-code dimension values in sub-configs.**
+3. **HDF5 IO**: in the `Dataset` class, **the HDF5 file must be opened inside `__getitem__`**. Opening it in `__init__` breaks multi-process DataLoaders, because file handles cannot be pickled.
+4. **Interface exports**: whenever you add a new file under `models/xxx/`, update `__all__` in the corresponding `__init__.py` to keep imports tidy.
+
+---
+
+*Maintainer: VLA Framework Team*
\ No newline at end of file
diff --git a/roboimi/demos/vla_scripts/train_vla.py b/roboimi/demos/vla_scripts/train_vla.py
new file mode 100644
index 0000000..5ffe1c3
--- /dev/null
+++ b/roboimi/demos/vla_scripts/train_vla.py
@@ -0,0 +1,45 @@
+import hydra
+from omegaconf import DictConfig, OmegaConf
+from hydra.utils import instantiate
+import torch
+import os
+
+# config_path must point at the config directory; it is resolved relative to
+# this script (roboimi/demos/vla_scripts/), so roboimi/vla/conf is ../../vla/conf.
+# config_name is the main config file name without the .yaml suffix.
+@hydra.main(version_base=None, config_path="../../vla/conf", config_name="config")
+def main(cfg: DictConfig):
+    print(f"Working directory : {os.getcwd()}")
+    print(f"Configuration:\n{OmegaConf.to_yaml(cfg)}")
+
+    # 1. Instantiate the agent.
+    # Hydra finds _target_ and recursively instantiates vlm_backbone,
+    # img_projector, and action_head.
+    print(">>> Instantiating VLA Agent...")
+    agent = instantiate(cfg.agent)
+
+    # Move the model to the GPU if one is available.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    agent.to(device)
+    print(f">>> Agent created successfully. Backbone: {type(agent.vlm_backbone).__name__}")
+
+    # 2. Instantiate the DataLoader (this assumes a YAML exists for the data).
+    # Instantiate the dataset
+    dataset = instantiate(cfg.data)
+
+    # Wrap it in a DataLoader
+    dataloader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=cfg.train.batch_size,
+        shuffle=True,
+        num_workers=4
+    )
+
+    # 3. Instantiate the optimizer (Hydra also supports partial instantiation)
+    # optimizer = instantiate(cfg.train.optimizer, params=agent.parameters())
+
+    # 4. Mock training loop
+    print(f">>> Starting training with batch size: {cfg.train.batch_size}")
+    # ... training loop logic here ...
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/roboimi/vla/__init__.py b/roboimi/vla/__init__.py
new file mode 100644
index 0000000..0509741
--- /dev/null
+++ b/roboimi/vla/__init__.py
@@ -0,0 +1 @@
+# export VLAAgent, VLAModelConfig
diff --git a/roboimi/vla/agent.py b/roboimi/vla/agent.py
new file mode 100644
index 0000000..6009b90
--- /dev/null
+++ b/roboimi/vla/agent.py
@@ -0,0 +1,73 @@
+# roboimi/vla/agent.py
+
+import torch
+import torch.nn as nn
+from typing import Optional, Dict, Union
+
+class VLAAgent(nn.Module):
+    def __init__(self,
+                 vlm_backbone: nn.Module,
+                 img_projector: nn.Module,
+                 action_head: nn.Module,
+                 state_dim: int,
+                 embed_dim: int):
+        super().__init__()
+        self.vlm_backbone = vlm_backbone
+        self.img_projector = img_projector
+        self.action_head = action_head
+
+        # Simple state encoder (usually needs no separate config; defined inline)
+        self.state_encoder = nn.Sequential(
+            nn.Linear(state_dim, embed_dim),
+            nn.Mish(),
+            nn.Linear(embed_dim, embed_dim)
+        )
+
+    def forward(self,
+                images: torch.Tensor,
+                state: torch.Tensor,
+                text: Optional[Union[str, list]] = None,
+                actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
+        """
+        Args:
+            images: [Batch, Obs_Horizon, C, H, W]; note the time dimension must be handled here
+            state: [Batch, Obs_Horizon, State_Dim]
+            text: optional text instructions
+            actions: [Batch, Pred_Horizon, Action_Dim] (training only)
+
+        Returns:
+            Training: loss scalar
+            Inference: predicted actions
+        """
+
+        B, T, C, H, W = images.shape
+
+        # 1. Image encoding (flatten the time dimension for efficiency):
+        # [B*T, C, H, W] -> [B*T, Vision_Dim]
+        flat_images = images.view(B * T, C, H, W)
+        vision_feats_dict = self.vlm_backbone(flat_images)
+        raw_img_emb = vision_feats_dict['image_embeds']  # [B*T, Vision_Dim]
+
+        # Project and restore the time dimension -> [B, T, Embed_Dim]
+        img_emb = self.img_projector(raw_img_emb)
+        img_emb = img_emb.view(B, T, -1)
+
+        # 2. State encoding
+        state_emb = self.state_encoder(state)  # [B, T, Embed_Dim]
+
+        # 3. Feature fusion (a simple early-fusion example).
+        # Image and state features could be concatenated along the feature
+        # dimension or along the time dimension; we could also use only the
+        # most recent frame as context, or keep the full history.
+        # Demonstrated here: context = (image history + state history),
+        # concatenated on time: [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed]
+        context = torch.cat([img_emb, state_emb], dim=1)
+
+        # 4. Action head branch
+        if actions is not None:
+            # --- Training mode: must return a loss ---
+            return self.action_head.compute_loss(context, actions)
+        else:
+            # --- Inference mode: must return the predicted action sequence ---
+            return self.action_head.predict_action(context)
\ No newline at end of file
diff --git a/roboimi/vla/conf/agent/default.yaml b/roboimi/vla/conf/agent/default.yaml
new file mode 100644
index 0000000..9ddde09
--- /dev/null
+++ b/roboimi/vla/conf/agent/default.yaml
@@ -0,0 +1,30 @@
+# Loaded via `- agent: default` in config.yaml, so this file is packaged
+# under `agent` and the defaults below mount the sub-configs at agent.* nodes.
+defaults:
+  # 1. Mount the backbone config at the agent.vlm_backbone node
+  - /backbone@vlm_backbone: siglip
+
+  # 2. Mount the projector config at the agent.img_projector node (new)
+  - /projector@img_projector: mlp
+
+  # 3. Mount the head config at the agent.action_head node
+  - /head@action_head: diffusion
+
+  # 4. Allow this file to override the configs above
+  - _self_
+
+_target_: roboimi.vla.agent.VLAAgent
+
+# Core hyperparameters: single source of truth
+state_dim: 14
+embed_dim: 512
+
+# --- Dimension-consistency bindings (interpolation) ---
+
+# Bind the projector: input follows the backbone, output follows the agent
+img_projector:
+  input_dim: ${..vlm_backbone.output_dim}  # read the backbone's output dim automatically
+  output_dim: ${..embed_dim}               # reference embed_dim above
+
+# Force head input dim = agent embedding dim
+action_head:
+  input_dim: ${..embed_dim}  # reference embed_dim above
\ No newline at end of file
diff --git a/roboimi/vla/conf/agent/tiny.yaml b/roboimi/vla/conf/agent/tiny.yaml
new file mode 100644
index 0000000..6a3bda1
--- /dev/null
+++ b/roboimi/vla/conf/agent/tiny.yaml
@@ -0,0 +1 @@
+# Small model for debugging
diff --git a/roboimi/vla/conf/backbone/clip.yaml b/roboimi/vla/conf/backbone/clip.yaml
new file mode 100644
index 0000000..b6cf693
--- /dev/null
+++ b/roboimi/vla/conf/backbone/clip.yaml
@@ -0,0 +1 @@
+# CLIP backbone config
diff --git a/roboimi/vla/conf/backbone/siglip.yaml b/roboimi/vla/conf/backbone/siglip.yaml
new file mode 100644
index 0000000..306bd12
--- /dev/null
+++ b/roboimi/vla/conf/backbone/siglip.yaml
@@ -0,0 +1,4 @@
+_target_: roboimi.vla.models.backbones.SigLIPBackbone
+model_name: "google/siglip-so400m-patch14-384"
+frozen: true
+output_dim: 1152  # feature dim of SigLIP so400m; declared explicitly so the projector can reference it
\ No newline at end of file
diff --git a/roboimi/vla/conf/config.yaml b/roboimi/vla/conf/config.yaml
new file mode 100644
index 0000000..a203c26
--- /dev/null
+++ b/roboimi/vla/conf/config.yaml
@@ -0,0 +1,12 @@
+defaults:
+  - _self_
+  - agent: default  # all sub-module choices are made in agent/default.yaml
+  - data: default_dataset
+  - train: gpu
+
+project_name: "vla_frame_refactored"
+seed: 42
+
+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
\ No newline at end of file
diff --git a/roboimi/vla/conf/data/default_dataset.yaml b/roboimi/vla/conf/data/default_dataset.yaml
new file mode 100644
index 0000000..6b52e13
--- /dev/null
+++ b/roboimi/vla/conf/data/default_dataset.yaml
@@ -0,0 +1,16 @@
+_target_: roboimi.vla.data.dataset.VLAHDF5Dataset  # must match the class name in dataset.py
+dataset_path: "/path/to/your/roboimi/demos/dataset/collected_data"
+pred_horizon: 16
+obs_horizon: 2
+
+# Nested Hydra instantiation: the transform is passed in as a parameter
+transform:
+  _target_: roboimi.vla.data.image_transforms.VLAImageProcessor
+  size: [224, 224]
+  mean: [0.5, 0.5, 0.5]  # common SigLIP/CLIP normalization
+  std: [0.5, 0.5, 0.5]
+
+# Tokenizer, if needed (not yet accepted by VLAHDF5Dataset; tokenization happens in the collate_fn)
+# tokenizer: null
+# _target_: roboimi.vla.data.text_processing.SimpleTokenizer
+# max_length: 77
\ No newline at end of file
diff --git a/roboimi/vla/conf/head/act.yaml b/roboimi/vla/conf/head/act.yaml
new file mode 100644
index 0000000..e4ecbb0
--- /dev/null
+++ b/roboimi/vla/conf/head/act.yaml
@@ -0,0 +1 @@
+# ACT-VAE head config
diff --git a/roboimi/vla/conf/head/diffusion.yaml b/roboimi/vla/conf/head/diffusion.yaml
new file mode 100644
index 0000000..a442fe5
--- /dev/null
+++ b/roboimi/vla/conf/head/diffusion.yaml
@@ -0,0 +1,8 @@
+_target_: roboimi.vla.models.heads.DiffusionActionHead
+
+# Required parameters, declared explicitly
+input_dim: ???  # must exist; filled in by agent/default.yaml
+action_dim: 7
+obs_horizon: 2
+pred_horizon: 16
+denoising_steps: 100
\ No newline at end of file
diff --git a/roboimi/vla/conf/projector/mlp.yaml b/roboimi/vla/conf/projector/mlp.yaml
new file mode 100644
index 0000000..d59eda2
--- /dev/null
+++ b/roboimi/vla/conf/projector/mlp.yaml
@@ -0,0 +1,6 @@
+_target_: roboimi.vla.models.projectors.MLPProjector
+
+input_dim: ???   # filled in via interpolation
+output_dim: ???  # filled in via interpolation
+hidden_dim: 1024
+dropout: 0.1
\ No newline at end of file
diff --git a/roboimi/vla/conf/projector/perceiver.yaml b/roboimi/vla/conf/projector/perceiver.yaml
new file mode 100644
index 0000000..e69de29
diff --git a/roboimi/vla/conf/train/debug.yaml b/roboimi/vla/conf/train/debug.yaml
new file mode 100644
index 0000000..3a8f68f
--- /dev/null
+++ b/roboimi/vla/conf/train/debug.yaml
@@ -0,0 +1 @@
+# Debug training hyperparameters
diff --git a/roboimi/vla/conf/train/gpu.yaml b/roboimi/vla/conf/train/gpu.yaml
new file mode 100644
index 0000000..5f39934
--- /dev/null
+++ b/roboimi/vla/conf/train/gpu.yaml
@@ -0,0 +1 @@
+# GPU training hyperparameters
diff --git a/roboimi/vla/core/__init__.py b/roboimi/vla/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/roboimi/vla/core/base_policy.py b/roboimi/vla/core/base_policy.py
new file mode 100644
index 0000000..b262417
--- /dev/null
+++ b/roboimi/vla/core/base_policy.py
@@ -0,0 +1 @@
+# define ActionHead(ABC)
diff --git a/roboimi/vla/core/base_vlm.py b/roboimi/vla/core/base_vlm.py
new file mode 100644
index 0000000..e785c85
--- /dev/null
+++ b/roboimi/vla/core/base_vlm.py
@@ -0,0 +1 @@
+# define VLMBackbone(ABC)
diff --git a/roboimi/vla/data/__init__.py b/roboimi/vla/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/roboimi/vla/data/dataset.py b/roboimi/vla/data/dataset.py
new file mode 100644
index 0000000..43bdd53
--- /dev/null
+++ b/roboimi/vla/data/dataset.py
@@ -0,0 +1,88 @@
+import h5py
+import torch
+import numpy as np
+from torch.utils.data import Dataset
+
+class VLAHDF5Dataset(Dataset):
+    def __init__(self,
+                 dataset_path: str,
+                 pred_horizon: int = 16,
+                 obs_horizon: int = 2,
+                 transform=None):
+        self.dataset_path = dataset_path
+        self.pred_horizon = pred_horizon
+        self.obs_horizon = obs_horizon
+        self.transform = transform
+
+        # 1. At init time we only read the metadata (shapes, lengths), not the
+        # contents. This is fast and uses almost no memory.
+        with h5py.File(self.dataset_path, 'r') as root:
+            self.demo_keys = list(root['data'].keys())
+
+            # Build the index table: (demo_key, start_time)
+            self.indices = []
+            for key in self.demo_keys:
+                demo = root['data'][key]
+                L = demo['actions'].shape[0]
+                # Enumerate every timestep of this trajectory
+                for t in range(L):
+                    self.indices.append((key, t))
+
+    def __len__(self):
+        return len(self.indices)
+
+    def __getitem__(self, idx):
+        key, t_start = self.indices[idx]
+
+        # 2. [Key point] Open the HDF5 file inside __getitem__.
+        # This guarantees each DataLoader worker gets its own file handle.
+        with h5py.File(self.dataset_path, 'r') as root:
+            demo = root['data'][key]
+
+            # Total trajectory length
+            L = demo['actions'].shape[0]
+
+            # --- Read actions ---
+            t_end = min(t_start + self.pred_horizon, L)
+            # HDF5 supports direct slicing, which is very fast
+            actions = demo['actions'][t_start : t_end]
+
+            # Pad if the action chunk is too short
+            if len(actions) < self.pred_horizon:
+                # Convert to tensor for padding
+                actions = torch.from_numpy(actions)
+                pad_len = self.pred_horizon - len(actions)
+                last_action = actions[-1].unsqueeze(0)
+                actions = torch.cat([actions, last_action.repeat(pad_len, 1)])
+                # actions now has length pred_horizon, so this marks the valid
+                # prefix with ones and the padded tail with zeros
+                action_mask = torch.cat([torch.ones(len(actions) - pad_len), torch.zeros(pad_len)])
+            else:
+                actions = torch.from_numpy(actions)
+                action_mask = torch.ones(self.pred_horizon)
+
+            # --- Read images ---
+            # Pad the observation history (when t_start < obs_horizon)
+            images_list = []
+            for i in range(self.obs_horizon):
+                t_read = max(0, t_start - self.obs_horizon + 1 + i)
+                # Read a single frame
+                img = demo['obs']['agentview_rgb'][t_read]
+                images_list.append(img)
+
+            # Stack and convert to tensor: [T, H, W, C] -> [T, C, H, W]
+            images = np.stack(images_list)
+            images = torch.from_numpy(images).permute(0, 3, 1, 2).float() / 255.0
+
+            # --- Read the language instruction ---
+            # Stored as an attribute on the demo group (see the README layout);
+            # adjust the key if your data uses a custom field
+            lang_text = demo.attrs.get("language", "")
+
+        # 3. Apply image augmentation
+        if self.transform:
+            images = self.transform(images)
+
+        return {
+            "images": images,
+            "text": lang_text,  # tokenized later in the collate_fn
+            "actions": actions,
+            "action_mask": action_mask
+        }
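+
+# --- Optional smoke test (sketch: "dataset.hdf5" is a placeholder path; run
+# only against a real file with the layout described in the README) ---
+if __name__ == "__main__":
+    ds = VLAHDF5Dataset("dataset.hdf5", pred_horizon=16, obs_horizon=2)
+    sample = ds[0]
+    for k, v in sample.items():
+        print(k, getattr(v, "shape", v))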
diff --git a/roboimi/vla/data/image_transforms.py b/roboimi/vla/data/image_transforms.py
new file mode 100644
index 0000000..d1350a0
--- /dev/null
+++ b/roboimi/vla/data/image_transforms.py
@@ -0,0 +1 @@
+# Image preprocessing
diff --git a/roboimi/vla/data/text_processing.py b/roboimi/vla/data/text_processing.py
new file mode 100644
index 0000000..ecd3c3c
--- /dev/null
+++ b/roboimi/vla/data/text_processing.py
@@ -0,0 +1 @@
+# Text tokenizer wrapper
diff --git a/roboimi/vla/models/backbones/__init__.py b/roboimi/vla/models/backbones/__init__.py
new file mode 100644
index 0000000..b28dec3
--- /dev/null
+++ b/roboimi/vla/models/backbones/__init__.py
@@ -0,0 +1,6 @@
+# Backbone models
+from .siglip import SigLIPBackbone
+from .clip import CLIPBackbone
+from .dinov2 import DinoV2Backbone
+
+__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"]
diff --git a/roboimi/vla/models/backbones/clip.py b/roboimi/vla/models/backbones/clip.py
new file mode 100644
index 0000000..c30ac7f
--- /dev/null
+++ b/roboimi/vla/models/backbones/clip.py
@@ -0,0 +1 @@
+# CLIP backbone implementation
diff --git a/roboimi/vla/models/backbones/dinov2.py b/roboimi/vla/models/backbones/dinov2.py
new file mode 100644
index 0000000..acba66c
--- /dev/null
+++ b/roboimi/vla/models/backbones/dinov2.py
@@ -0,0 +1 @@
+# DinoV2 backbone implementation
diff --git a/roboimi/vla/models/backbones/siglip.py b/roboimi/vla/models/backbones/siglip.py
new file mode 100644
index 0000000..5fe0b9e
--- /dev/null
+++ b/roboimi/vla/models/backbones/siglip.py
@@ -0,0 +1 @@
+# SigLIP backbone implementation
diff --git a/roboimi/vla/models/heads/__init__.py b/roboimi/vla/models/heads/__init__.py
new file mode 100644
index 0000000..9de0395
--- /dev/null
+++ b/roboimi/vla/models/heads/__init__.py
@@ -0,0 +1,5 @@
+# Action head models
+from .diffusion import DiffusionActionHead
+from .act import ACTHead
+
+__all__ = ["DiffusionActionHead", "ACTHead"]
diff --git a/roboimi/vla/models/heads/act.py b/roboimi/vla/models/heads/act.py
new file mode 100644
index 0000000..1860fe4
--- /dev/null
+++ b/roboimi/vla/models/heads/act.py
@@ -0,0 +1 @@
+# ACT-VAE action head implementation
diff --git a/roboimi/vla/models/heads/diffusion.py b/roboimi/vla/models/heads/diffusion.py
new file mode 100644
index 0000000..61168d4
--- /dev/null
+++ b/roboimi/vla/models/heads/diffusion.py
@@ -0,0 +1 @@
+# Diffusion Policy action head implementation
diff --git a/roboimi/vla/models/projectors/__init__.py b/roboimi/vla/models/projectors/__init__.py
new file mode 100644
index 0000000..14ca3df
--- /dev/null
+++ b/roboimi/vla/models/projectors/__init__.py
@@ -0,0 +1,5 @@
+# Projector models
+from .mlp import MLPProjector
+from .perceiver import PerceiverResampler
+
+__all__ = ["MLPProjector", "PerceiverResampler"]
\ No newline at end of file
diff --git a/roboimi/vla/models/projectors/mlp.py b/roboimi/vla/models/projectors/mlp.py
new file mode 100644
index 0000000..0e7f7de
--- /dev/null
+++ b/roboimi/vla/models/projectors/mlp.py
@@ -0,0 +1 @@
+# MLP projector implementation
diff --git a/roboimi/vla/models/projectors/perceiver.py b/roboimi/vla/models/projectors/perceiver.py
new file mode 100644
index 0000000..de29008
--- /dev/null
+++ b/roboimi/vla/models/projectors/perceiver.py
@@ -0,0 +1 @@
+# Perceiver Resampler implementation
diff --git a/roboimi/vla/modules/__init__.py b/roboimi/vla/modules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/roboimi/vla/modules/encoders.py b/roboimi/vla/modules/encoders.py
new file mode 100644
index 0000000..0a5ba28
--- /dev/null
+++ b/roboimi/vla/modules/encoders.py
@@ -0,0 +1 @@
+# StateEncoder, ActionEncoder
diff --git a/roboimi/vla/modules/fusion.py b/roboimi/vla/modules/fusion.py
new file mode 100644
index 0000000..7e0bba3
--- /dev/null
+++ b/roboimi/vla/modules/fusion.py
@@ -0,0 +1 @@
+# TransformerFusion, FiLM
diff --git a/roboimi/vla/scripts/convert_to_hdf5.py b/roboimi/vla/scripts/convert_to_hdf5.py
new file mode 100644
index 0000000..4db4a47
--- /dev/null
+++ b/roboimi/vla/scripts/convert_to_hdf5.py
@@ -0,0 +1 @@
+# Convert an image folder to HDF5 format
diff --git a/roboimi/vla/scripts/download_weights.py b/roboimi/vla/scripts/download_weights.py
new file mode 100644
index 0000000..18cc9c1
--- /dev/null
+++ b/roboimi/vla/scripts/download_weights.py
@@ -0,0 +1 @@
+# Download pretrained VLM weights
diff --git a/roboimi/vla/scripts/visualize_data.py b/roboimi/vla/scripts/visualize_data.py
new file mode 100644
index 0000000..1a439cf
--- /dev/null
+++ b/roboimi/vla/scripts/visualize_data.py
@@ -0,0 +1 @@
+# Check that the Dataset reads data correctly