diff --git a/roboimi/demos/diana_eval.py b/roboimi/demos/diana_eval.py deleted file mode 100644 index e6994d4..0000000 --- a/roboimi/demos/diana_eval.py +++ /dev/null @@ -1,206 +0,0 @@ -import torch -import os -import numpy as np -import matplotlib.pyplot as plt -from tqdm import tqdm -from einops import rearrange -from roboimi.utils.utils import set_seed -from roboimi.utils.io_utils import IOUtils -from roboimi.utils.model_interface import ModelInterface -from roboimi.envs.double_pos_ctrl_env import make_sim_env -# from visualize_episodes import save_videos -from roboimi.utils.act_ex_utils import sample_transfer_pose - - -class ActionSmoother: - """ - 动作平滑器,支持多种平滑策略 - """ - def __init__(self, action_dim, method='ema', alpha=0.3, window_size=5): - """ - Args: - action_dim: 动作维度 - method: 平滑方法 ('ema', 'moving_avg', 'lowpass', 'none') - alpha: EMA 平滑系数 (0-1),越小越平滑 - window_size: 滑动窗口大小 - """ - self.action_dim = action_dim - self.method = method - self.alpha = alpha - self.window_size = window_size - self.history = [] - self.prev_action = None - - def smooth(self, action): - """ - 对动作进行平滑处理 - - Args: - action: 当前动作 [action_dim] - - Returns: - smoothed_action: 平滑后的动作 - """ - if self.method == 'none': - return action - - if self.method == 'ema': - # 指数移动平均 - if self.prev_action is None: - smoothed = action - else: - smoothed = self.alpha * action + (1 - self.alpha) * self.prev_action - self.prev_action = smoothed - return smoothed - - elif self.method == 'moving_avg': - # 滑动平均 - self.history.append(action.copy()) - if len(self.history) > self.window_size: - self.history.pop(0) - return np.mean(self.history, axis=0) - - elif self.method == 'lowpass': - # 一阶低通滤波器 - if self.prev_action is None: - smoothed = action - else: - smoothed = self.prev_action + self.alpha * (action - self.prev_action) - self.prev_action = smoothed - return smoothed - - else: - raise ValueError(f"Unknown smoothing method: {self.method}") - - def reset(self): - """重置平滑器状态""" - self.history = [] - self.prev_action = None - - -#should be added into IOUtils -def get_image(obs,camera_names): - curr_images = [] - for cam_name in camera_names: - curr_image = rearrange(obs['images'][cam_name], 'h w c -> c h w') - curr_images.append(curr_image) - curr_image = np.stack(curr_images, axis=0) - curr_image = torch.from_numpy(curr_image / 255.0).float().cuda().unsqueeze(0) - return curr_image - - -def eval_bc(config, ckpt_name='policy_best.ckpt', save_episode=True): - set_seed(1) - model_interface = ModelInterface(config) - model_interface.setup() - policy = IOUtils.load_policy(config, ckpt_name) - stats = IOUtils.load_stats(config['ckpt_dir']) - num_rollouts = 3 - episode_returns = [] - highest_rewards = [] - - - - - - run_episode(config, policy, stats, - save_episode,num_rollouts) - # episode_return, episode_highest_reward = run_episode(config, policy, stats, - # save_episode,num_rollouts) - - - - -def run_episode(config, policy, stats, save_episode,num_rollouts): - - if 'sim_transfer' in config['task_name']: - task_name = 'sim_transfer' #config['task_name'] - env = make_sim_env(task_name) - - max_timesteps = config['episode_len'] - max_timesteps = int(max_timesteps * 1) - pre_process = lambda s_qpos: (s_qpos - stats['qpos_mean']) / stats['qpos_std'] - post_process = lambda a: a * stats['action_std'] + stats['action_mean'] - box_pos = sample_transfer_pose() - - # 初始化动作平滑器 - action_dim = config['action_dim'] - use_smoothing = config.get('use_action_smoothing', False) - smooth_method = config.get('smooth_method', 'ema') - smooth_alpha = config.get('smooth_alpha', 0.3) - - if use_smoothing and config['policy_class'] == "GR00T": - smoother = ActionSmoother(action_dim, method=smooth_method, alpha=smooth_alpha) - print(f"Action smoothing enabled: method={smooth_method}, alpha={smooth_alpha}") - else: - smoother = None - - for rollout_id in range(num_rollouts): - print("\nrollout_id===",rollout_id,"\n") - image_list = [] - rewards = [] - query_frequency = config['policy_config'].get('num_queries', 1) - print("query_freq =====",query_frequency) - env.reset(box_pos) - - # 重置平滑器 - if smoother is not None: - smoother.reset() - - with torch.inference_mode(): - for t in range(700): - image_list.append(env._get_image_obs()['images'] if 'images' in env._get_image_obs() else {print("img error")}) - qpos_numpy = np.array(env._get_qpos_obs()['qpos']) - qpos = pre_process(qpos_numpy) - qpos = torch.from_numpy(qpos).float().cuda().unsqueeze(0) - curr_image = get_image(env._get_image_obs(), config['camera_names']) - if config['policy_class'] in ["ACT", "ACTTV", "GR00T"]: - if t % query_frequency == 0: - all_actions = policy(qpos, curr_image) - raw_action = all_actions[:, t % query_frequency] - raw_action = raw_action.squeeze(0).cpu().numpy() - elif config['policy_class'] == "CNNMLP": - raw_action = policy(qpos, curr_image) - else: - raise NotImplementedError - - - action = post_process(raw_action) - - # 应用动作平滑(仅对 GR00T) - if smoother is not None: - action = smoother.smooth(action) - - print("action == ",action) - env.step_jnt(action) - rewards.append(env.rew) - env.render() - - - rewards = np.array(rewards) - # episode_return = np.sum(rewards[rewards != None]) - # episode_highest_reward = np.max(rewards) - # env.viewer.close() - - # del env - # return episode_return, episode_highest_reward - - - - -def test_env(): - try: - env = make_sim_env('sim_transfer') - env.reset() - while True: pass - except KeyboardInterrupt: - del env - print("stop") - -if __name__ == '__main__': - # test_env() - io_utils = IOUtils() - config = io_utils.load_config() - eval_bc(config) - - diff --git a/roboimi/demos/eval.py b/roboimi/demos/eval.py deleted file mode 100644 index 792c81a..0000000 --- a/roboimi/demos/eval.py +++ /dev/null @@ -1,152 +0,0 @@ -import torch -import os -import numpy as np -import matplotlib.pyplot as plt -from tqdm import tqdm -from einops import rearrange -from roboimi.utils.utils import set_seed -from roboimi.utils.io_utils import IOUtils -from roboimi.utils.model_interface import ModelInterface -from roboimi.envs.vx300s_jnt import make_sim_env -import time - -# from visualize_episodes import save_videos -from roboimi.utils.utils import sample_box_pose, sample_insertion_pose - - - -#should be added into IOUtils -def get_image(obs,camera_names): - curr_images = [] - for cam_name in camera_names: - curr_image = rearrange(obs['images'][cam_name], 'h w c -> c h w') - curr_images.append(curr_image) - curr_image = np.stack(curr_images, axis=0) - curr_image = torch.from_numpy(curr_image / 255.0).float().cuda().unsqueeze(0) - return curr_image - - -def eval_bc(config, ckpt_name='policy_best.ckpt', save_episode=True): - set_seed(1) - model_interface = ModelInterface(config) - task_name = 'sim_insertion' #config['task_name'] - model_interface.setup() - policy = IOUtils.load_policy(config, ckpt_name) - stats = IOUtils.load_stats(config['ckpt_dir']) - num_rollouts = 3 - episode_returns = [] - highest_rewards = [] - for rollout_id in range(num_rollouts): - episode_return, episode_highest_reward = run_episode(config, policy, stats, - save_episode,rollout_id) - - - - -def run_episode(config, policy, stats, save_episode,rollout_id): - print("\nrollout_id===",rollout_id,"\n") - pre_process = lambda s_qpos: (s_qpos - stats['qpos_mean']) / stats['qpos_std'] - post_process = lambda a: a * stats['action_std'] + stats['action_mean'] - if 'sim_insertion' in config['task_name']: - peg_pose, socket_pose = sample_insertion_pose() - box_pose = np.hstack((peg_pose[:3],socket_pose[:3])) # used in sim reset - task_name = 'sim_insertion' #config['task_name'] - env = make_sim_env(task_name) - env.reset(box_pose) - max_timesteps = config['episode_len'] - max_timesteps = int(max_timesteps * 1) - - image_list = [] - rewards = [] - query_frequency = config['policy_config'].get('num_queries', 1) - - with torch.inference_mode(): - for t in range(700): - # print("obs_img",env.obs['images']) - image_list.append(env.obs['images'] if 'images' in env.obs else {print("img error")}) - qpos_numpy = np.array(env.obs['qpos']) - qpos = pre_process(qpos_numpy) - qpos = torch.from_numpy(qpos).float().cuda().unsqueeze(0) - curr_image = get_image(env.obs, config['camera_names']) - if config['policy_class'] == "ACT" or "ACTTV": - if t % query_frequency == 0: - all_actions = policy(qpos, curr_image) - elif config['policy_class'] == "CNNMLP": - raw_action = policy(qpos, curr_image) - else: - raise NotImplementedError - raw_action = all_actions[:, t % query_frequency] - raw_action = raw_action.squeeze(0).cpu().numpy() - action = post_process(raw_action) - - env.step(action) - rewards.append(env.rew) - env.render() - - - rewards = np.array(rewards) - episode_return = np.sum(rewards[rewards != None]) - episode_highest_reward = np.max(rewards) - env.viewer.close() - - del env - return episode_return, episode_highest_reward - - -def test_env(): - try: - env = make_sim_env('sim_insertion') - box_pos = np.concatenate(sample_insertion_pose()) - env.reset(box_pos) - while True: pass - except KeyboardInterrupt: - del env - print("stop") - - -if __name__ == '__main__': - test_env() - # io_utils = IOUtils() - # config = io_utils.load_config() - # eval_bc(config) - - - - -# config===== {'onscreen_render': False, -# 'eval': 1, -# 'ckpt_dir': 'ckpt_models', -# 'num_epochs': 3000, -# 'temporal_agg': False, -# 'policy_class': 'ACT', -# 'backbone': 'resnet18', -# 'seed': 0, 'real_robot': 0, -# 'task_name': 'sim_insertion', -# 'images_render_height': 480, -# 'images_render_width': 640, -# 'left_arm_DOF_number': 6, -# 'right_arm_DOF_number': 6, -# 'left_qpos_raw': 8, -# 'right_qpos_raw': 8, -# 'left_qvel_raw': 8, -# 'right_qvel_raw': 8, -# 'dataset_dir': '/home/arm/lzd/act_env/dataset/sim_insertion', -# 'num_episodes': 7, -# 'episode_len': 400, -# 'camera_names': ['top'], -# 'xml_dir': None, -# 'batch_size': 8, -# 'state_dim': 14, -# 'action_dim': 14, -# 'lr_backbone': 1e-05, -# 'enc_layers': 4, -# 'dec_layers': 7, -# 'nheads': 8, -# 'qpos_noise_std': 0, -# 'DT': 0.02, -# 'lr': 1e-05, -# 'kl_weight': 10, -# 'chunk_size': 100, -# 'hidden_dim': 512, -# 'dim_feedforward': 3200, -# 'policy_config': {'lr': 1e-05, 'num_queries': 100, 'kl_weight': 10, 'hidden_dim': 512, 'dim_feedforward': 3200, 'lr_backbone': 1e-05, 'backbone': 'resnet18', 'enc_layers': 4, 'dec_layers': 7, 'nheads': 8, 'camera_names': ['top']}} \ No newline at end of file diff --git a/roboimi/demos/training.py b/roboimi/demos/training.py deleted file mode 100644 index 858960b..0000000 --- a/roboimi/demos/training.py +++ /dev/null @@ -1,179 +0,0 @@ -import torch -import os -from tqdm import tqdm -import numpy as np -from copy import deepcopy -from itertools import repeat -import matplotlib.pyplot as plt -import time -from roboimi.utils.utils import set_seed, compute_dict_mean, detach_dict, load_data -from roboimi.utils.io_utils import IOUtils -from roboimi.utils.model_interface import ModelInterface -import matplotlib.pyplot as plt - -def train_bc(config): - num_epochs = config['num_epochs'] - ckpt_dir = config['ckpt_dir'] - seed = config['seed'] - - os.makedirs(ckpt_dir, exist_ok=True) - - set_seed(seed) - - model_interface = ModelInterface(config) - model_interface.setup() - - policy = model_interface.make_policy() - policy.cuda() - optimizer = model_interface.make_optimizer(policy) - # print("cam names=====",config['camera_names']) - train_dataloader, val_dataloader, stats, _ = load_data( - config['dataset_dir'], - config['num_episodes'], - config['camera_names'], - config['batch_size'], - config['batch_size']) - - IOUtils.save_stats(ckpt_dir, stats) - - train_history = [] - validation_history = [] - min_val_loss = np.inf - min_train_loss = np.inf - best_ckpt_info = None - - plt.ion() - fig, ax = plt.subplots() - train_losses, val_losses = [], [] - train_line, = ax.plot([], [], label='Train Loss') - val_line, = ax.plot([], [], label='Validation Loss') - ax.autoscale_view() - ax.set_xlabel('Epoch') - ax.set_ylabel('Loss') - ax.legend() - ax.grid(True) - - - train_annotation = ax.annotate('', xy=(0, 0), textcoords='offset points') - val_annotation = ax.annotate('', xy=(0, 0), textcoords='offset points') - - - min_train_text = ax.text(0.85, 0.5, '', transform=ax.transAxes, fontsize=10, verticalalignment='center', horizontalalignment='left', bbox=dict(facecolor='white', alpha=0.5)) - min_val_text = ax.text(0.85, 0.45, '', transform=ax.transAxes, fontsize=10, verticalalignment='center', horizontalalignment='left', bbox=dict(facecolor='white', alpha=0.5)) - - for epoch in tqdm(range(num_epochs)): - print(f'\nEpoch {epoch}') - - # Validation - epoch_val_loss, epoch_summary = validate(policy, val_dataloader) - validation_history.append(epoch_summary) - val_losses.append(epoch_val_loss.cpu().item()) - - if epoch_val_loss < min_val_loss: - min_val_loss = epoch_val_loss - min_val_epoch = epoch - best_ckpt_info = (epoch, min_val_loss, - deepcopy(policy.state_dict())) - - print(f'Val loss: {epoch_val_loss:.5f}') - print_summary(epoch_summary) - - # Training - epoch_train_loss, epoch_summary = train_epoch( - policy, optimizer, train_dataloader) - train_history.append(epoch_summary) - train_losses.append(epoch_train_loss.cpu().item()) - - if epoch_train_loss < min_train_loss: - min_train_loss = epoch_train_loss - min_train_epoch = epoch - - print(f'Train loss: {epoch_train_loss:.5f}') - print_summary(epoch_summary) - - # Update the plot with the new data - train_line.set_xdata(range(len(train_losses))) - train_line.set_ydata(train_losses) - val_line.set_xdata(range(len(val_losses))) - val_line.set_ydata(val_losses) - - # Update annotations with the latest loss values at their respective positions - train_annotation.set_position((len(train_losses)-1, train_losses[-1])) - train_annotation.xy = (len(train_losses)-1, train_losses[-1]) - train_annotation.set_text(f'{train_losses[-1]:.5f}') - - val_annotation.set_position((len(val_losses)-1, val_losses[-1])) - val_annotation.xy = (len(val_losses)-1, val_losses[-1]) - val_annotation.set_text(f'{val_losses[-1]:.5f}') - - # Update text objects with the minimum loss values, fixed on the right side - min_train_text.set_text(f'Min Train Loss: {min_train_loss:.5f} (Epoch {min_train_epoch})') - min_val_text.set_text(f'Min Val Loss: {min_val_loss:.5f} (Epoch {min_val_epoch})') - - ax.relim() - ax.autoscale_view() - plt.draw() - plt.pause(0.1) - - - plt.ioff() - IOUtils.save_checkpoint(policy, 'last', ckpt_dir, seed, 'last') - - best_epoch, min_val_loss, best_state_dict = best_ckpt_info - IOUtils.save_checkpoint(best_state_dict, best_epoch, - ckpt_dir, seed, 'best', min_val_loss) - print( - f'Training finished:\nSeed {seed}, val loss {min_val_loss:.6f} at epoch {best_epoch}') - - IOUtils.plot_history(train_history, validation_history, - num_epochs, ckpt_dir, seed) - - return best_ckpt_info - - - - - - -def validate(policy, dataloader): - policy.eval() - epoch_dicts = [] - with torch.inference_mode(): - for data in dataloader: - forward_dict = forward_pass(data, policy) - epoch_dicts.append(forward_dict) - epoch_summary = compute_dict_mean(epoch_dicts) - return epoch_summary['loss'], epoch_summary - - -def train_epoch(policy, optimizer, dataloader): - policy.train() - epoch_dicts = [] - for data in dataloader: - optimizer.zero_grad() - forward_dict = forward_pass(data, policy) - loss = forward_dict['loss'] - loss.backward() - optimizer.step() - epoch_dicts.append(detach_dict(forward_dict)) - epoch_summary = compute_dict_mean(epoch_dicts) - return epoch_summary['loss'], epoch_summary - - -def forward_pass(data, policy): - image_data, qpos_data, action_data, is_pad = data - image_data, qpos_data, action_data, is_pad = image_data.cuda( - ), qpos_data.cuda(), action_data.cuda(), is_pad.cuda() - return policy(qpos_data, image_data, action_data, is_pad) - - -def print_summary(summary): - summary_string = ' '.join( - [f'{k}: {v.item():.3f}' for k, v in summary.items()]) - print(summary_string) - - -if __name__ == '__main__': - io_utils = IOUtils() - config = io_utils.load_config() - train_bc(config) diff --git a/roboimi/detr/LICENSE b/roboimi/detr/LICENSE deleted file mode 100644 index b1395e9..0000000 --- a/roboimi/detr/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2020 - present, Facebook, Inc - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/roboimi/detr/README.md b/roboimi/detr/README.md deleted file mode 100644 index 500b1b8..0000000 --- a/roboimi/detr/README.md +++ /dev/null @@ -1,9 +0,0 @@ -This part of the codebase is modified from DETR https://github.com/facebookresearch/detr under APACHE 2.0. - - @article{Carion2020EndtoEndOD, - title={End-to-End Object Detection with Transformers}, - author={Nicolas Carion and Francisco Massa and Gabriel Synnaeve and Nicolas Usunier and Alexander Kirillov and Sergey Zagoruyko}, - journal={ArXiv}, - year={2020}, - volume={abs/2005.12872} - } \ No newline at end of file diff --git a/roboimi/detr/main.py b/roboimi/detr/main.py deleted file mode 100644 index 4891049..0000000 --- a/roboimi/detr/main.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -import argparse -from pathlib import Path - -import numpy as np -import torch -from .models import build_ACT_model, build_CNNMLP_model - - -def get_args_parser(): - parser = argparse.ArgumentParser('Set transformer detector', add_help=False) - parser.add_argument('--lr', default=1e-4, type=float) # will be overridden - parser.add_argument('--lr_backbone', default=1e-5, type=float) # will be overridden - parser.add_argument('--batch_size', default=2, type=int) # not used - parser.add_argument('--weight_decay', default=1e-4, type=float) - parser.add_argument('--epochs', default=300, type=int) # not used - parser.add_argument('--lr_drop', default=200, type=int) # not used - parser.add_argument('--clip_max_norm', default=0.1, type=float, # not used - help='gradient clipping max norm') - parser.add_argument('--qpos_noise_std', action='store', default=0, type=float, help='lr', required=False) - - # Model parameters - # * Backbone - parser.add_argument('--backbone', default='resnet18', type=str, # will be overridden - help="Name of the convolutional backbone to use") - parser.add_argument('--dilation', action='store_true', - help="If true, we replace stride with dilation in the last convolutional block (DC5)") - parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'), - help="Type of positional embedding to use on top of the image features") - parser.add_argument('--camera_names', default=[], type=list, # will be overridden - help="A list of camera names") - - # * Transformer - parser.add_argument('--enc_layers', default=4, type=int, # will be overridden - help="Number of encoding layers in the transformer") - parser.add_argument('--dec_layers', default=6, type=int, # will be overridden - help="Number of decoding layers in the transformer") - parser.add_argument('--dim_feedforward', default=2048, type=int, # will be overridden - help="Intermediate size of the feedforward layers in the transformer blocks") - parser.add_argument('--hidden_dim', default=256, type=int, # will be overridden - help="Size of the embeddings (dimension of the transformer)") - parser.add_argument('--dropout', default=0.1, type=float, - help="Dropout applied in the transformer") - parser.add_argument('--nheads', default=8, type=int, # will be overridden - help="Number of attention heads inside the transformer's attentions") - parser.add_argument('--num_queries', default=400, type=int, # will be overridden - help="Number of query slots") - parser.add_argument('--pre_norm', action='store_true') - parser.add_argument('--state_dim', default=14, type=int) - parser.add_argument('--action_dim', default=14, type=int) - - - # * Segmentation - parser.add_argument('--masks', action='store_true', - help="Train segmentation head if the flag is provided") - - - - return parser - - -def build_ACT_model_and_optimizer(args_override): - parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()]) - args = parser.parse_args() - - for k, v in args_override.items(): - setattr(args, k, v) - - model = build_ACT_model(args) - model.cuda() - - param_dicts = [ - {"params": [p for n, p in model.named_parameters() if "backbone" not in n and p.requires_grad]}, - { - "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad], - "lr": args.lr_backbone, - }, - ] - optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, - weight_decay=args.weight_decay) - - return model, optimizer - - -def build_CNNMLP_model_and_optimizer(args_override): - parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()]) - args = parser.parse_args() - - for k, v in args_override.items(): - setattr(args, k, v) - - model = build_CNNMLP_model(args) - model.cuda() - - param_dicts = [ - {"params": [p for n, p in model.named_parameters() if "backbone" not in n and p.requires_grad]}, - { - "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad], - "lr": args.lr_backbone, - }, - ] - optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, - weight_decay=args.weight_decay) - - return model, optimizer - diff --git a/roboimi/detr/models/__init__.py b/roboimi/detr/models/__init__.py deleted file mode 100644 index cc78db1..0000000 --- a/roboimi/detr/models/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -from .detr_vae import build as build_vae -from .detr_vae import build_cnnmlp as build_cnnmlp - -def build_ACT_model(args): - return build_vae(args) - -def build_CNNMLP_model(args): - return build_cnnmlp(args) \ No newline at end of file diff --git a/roboimi/detr/models/backbone.py b/roboimi/detr/models/backbone.py deleted file mode 100644 index 759bfb5..0000000 --- a/roboimi/detr/models/backbone.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -""" -Backbone modules. -""" -from collections import OrderedDict - -import torch -import torch.nn.functional as F -import torchvision -from torch import nn -from torchvision.models._utils import IntermediateLayerGetter -from typing import Dict, List - -from util.misc import NestedTensor, is_main_process - -from .position_encoding import build_position_encoding - -class FrozenBatchNorm2d(torch.nn.Module): - """ - BatchNorm2d where the batch statistics and the affine parameters are fixed. - - Copy-paste from torchvision.misc.ops with added eps before rqsrt, - without which any other policy_models than torchvision.policy_models.resnet[18,34,50,101] - produce nans. - """ - - def __init__(self, n): - super(FrozenBatchNorm2d, self).__init__() - self.register_buffer("weight", torch.ones(n)) - self.register_buffer("bias", torch.zeros(n)) - self.register_buffer("running_mean", torch.zeros(n)) - self.register_buffer("running_var", torch.ones(n)) - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - num_batches_tracked_key = prefix + 'num_batches_tracked' - if num_batches_tracked_key in state_dict: - del state_dict[num_batches_tracked_key] - - super(FrozenBatchNorm2d, self)._load_from_state_dict( - state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs) - - def forward(self, x): - # move reshapes to the beginning - # to make it fuser-friendly - w = self.weight.reshape(1, -1, 1, 1) - b = self.bias.reshape(1, -1, 1, 1) - rv = self.running_var.reshape(1, -1, 1, 1) - rm = self.running_mean.reshape(1, -1, 1, 1) - eps = 1e-5 - scale = w * (rv + eps).rsqrt() - bias = b - rm * scale - return x * scale + bias - - -class BackboneBase(nn.Module): - - def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool): - super().__init__() - # for name, parameter in backbone.named_parameters(): # only train later layers # TODO do we want this? - # if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: - # parameter.requires_grad_(False) - if return_interm_layers: - return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} - else: - return_layers = {'layer4': "0"} - self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) - self.num_channels = num_channels - - def forward(self, tensor): - xs = self.body(tensor) - return xs - # out: Dict[str, NestedTensor] = {} - # for name, x in xs.items(): - # m = tensor_list.mask - # assert m is not None - # mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] - # out[name] = NestedTensor(x, mask) - # return out - - -class Backbone(BackboneBase): - """ResNet backbone with frozen BatchNorm.""" - def __init__(self, name: str, - train_backbone: bool, - return_interm_layers: bool, - dilation: bool): - backbone = getattr(torchvision.models, name)( - replace_stride_with_dilation=[False, False, dilation], - pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) # pretrained # TODO do we want frozen batch_norm?? - num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 - super().__init__(backbone, train_backbone, num_channels, return_interm_layers) - - -# class DINOv2BackBone(nn.Module): -# def __init__(self) -> None: -# super().__init__() -# self.body = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14') -# self.body.eval() -# self.num_channels = 384 - -# @torch.no_grad() -# def forward(self, tensor): -# xs = self.body.forward_features(tensor)["x_norm_patchtokens"] -# od = OrderedDict() -# od["0"] = xs.reshape(xs.shape[0], 22, 16, 384).permute(0, 3, 2, 1) -# return od - -class DINOv2BackBone(nn.Module): - def __init__(self, return_interm_layers: bool = False) -> None: - super().__init__() - self.body = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14') - self.body.eval() - self.num_channels = 384 - self.return_interm_layers = return_interm_layers - - @torch.no_grad() - def forward(self, tensor): - features = self.body.forward_features(tensor) - - if self.return_interm_layers: - - layer1 = features["x_norm_patchtokens"] - layer2 = features["x_norm_patchtokens"] - layer3 = features["x_norm_patchtokens"] - layer4 = features["x_norm_patchtokens"] - - od = OrderedDict() - od["0"] = layer1.reshape(layer1.shape[0], 22, 16, 384).permute(0, 3, 2, 1) - od["1"] = layer2.reshape(layer2.shape[0], 22, 16, 384).permute(0, 3, 2, 1) - od["2"] = layer3.reshape(layer3.shape[0], 22, 16, 384).permute(0, 3, 2, 1) - od["3"] = layer4.reshape(layer4.shape[0], 22, 16, 384).permute(0, 3, 2, 1) - return od - else: - xs = features["x_norm_patchtokens"] - od = OrderedDict() - od["0"] = xs.reshape(xs.shape[0], 22, 16, 384).permute(0, 3, 2, 1) - return od - -class Joiner(nn.Sequential): - def __init__(self, backbone, position_embedding): - super().__init__(backbone, position_embedding) - - def forward(self, tensor_list: NestedTensor): - xs = self[0](tensor_list) - out: List[NestedTensor] = [] - pos = [] - for name, x in xs.items(): - out.append(x) - # position encoding - pos.append(self[1](x).to(x.dtype)) - - return out, pos - - -def build_backbone(args): - position_embedding = build_position_encoding(args) - train_backbone = args.lr_backbone > 0 - return_interm_layers = args.masks - if args.backbone == 'dino_v2': - backbone = DINOv2BackBone() - else: - assert args.backbone in ['resnet18', 'resnet34'] - backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) - model = Joiner(backbone, position_embedding) - model.num_channels = backbone.num_channels - return model diff --git a/roboimi/detr/models/detr_vae.py b/roboimi/detr/models/detr_vae.py deleted file mode 100644 index afcdc5d..0000000 --- a/roboimi/detr/models/detr_vae.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -""" -DETR model and criterion classes. -""" -import torch -from torch import nn -from torch.autograd import Variable -from .backbone import build_backbone -from .transformer import build_transformer, TransformerEncoder, TransformerEncoderLayer - -import numpy as np - - -def reparametrize(mu, logvar): - std = logvar.div(2).exp() - eps = Variable(std.data.new(std.size()).normal_()) - return mu + std * eps - - -def get_sinusoid_encoding_table(n_position, d_hid): - def get_position_angle_vec(position): - return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] - - sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) - sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i - sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 - - return torch.FloatTensor(sinusoid_table).unsqueeze(0) - - -class DETRVAE(nn.Module): - """ This is the DETR module that performs object detection """ - def __init__(self, backbones, transformer, encoder, state_dim, action_dim, num_queries, camera_names): - """ Initializes the model. - Parameters: - backbones: torch module of the backbone to be used. See backbone.py - transformer: torch module of the transformer architecture. See transformer.py - state_dim: robot state dimension of the environment - num_queries: number of object queries, ie detection slot. This is the maximal number of objects - DETR can detect in a single image. For COCO, we recommend 100 queries. - aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. - """ - super().__init__() - self.num_queries = num_queries - self.camera_names = camera_names - self.transformer = transformer - self.encoder = encoder - hidden_dim = transformer.d_model - self.action_head = nn.Linear(hidden_dim, action_dim) - self.is_pad_head = nn.Linear(hidden_dim, 1) - self.query_embed = nn.Embedding(num_queries, hidden_dim) - if backbones is not None: - self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) - self.backbones = nn.ModuleList(backbones) - self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim) - else: - raise NotImplementedError - # input_dim = 14 + 7 # robot_state + env_state - # self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim) - # self.input_proj_env_state = nn.Linear(7, hidden_dim) - # self.pos = torch.nn.Embedding(2, hidden_dim) - # self.backbones = None - - # encoder extra parameters - self.latent_dim = 32 # final size of latent z # TODO tune - self.cls_embed = nn.Embedding(1, hidden_dim) # extra cls token embedding - self.encoder_action_proj = nn.Linear(action_dim, hidden_dim) # project action to embedding - self.encoder_joint_proj = nn.Linear(state_dim, hidden_dim) # project qpos to embedding - self.latent_proj = nn.Linear(hidden_dim, self.latent_dim*2) # project hidden state to latent std, var - self.register_buffer('pos_table', get_sinusoid_encoding_table(1+1+num_queries, hidden_dim)) # [CLS], qpos, a_seq - - # decoder extra parameters - self.latent_out_proj = nn.Linear(self.latent_dim, hidden_dim) # project latent sample to embedding - self.additional_pos_embed = nn.Embedding(2, hidden_dim) # learned position embedding for proprio and latent - - def forward(self, qpos, image, env_state, actions=None, is_pad=None): - """ - qpos: batch, qpos_dim - image: batch, num_cam, channel, height, width - env_state: None - actions: batch, seq, action_dim - """ - is_training = actions is not None # train or val - bs, _ = qpos.shape - ### Obtain latent z from action sequence - if is_training: - # project action sequence to embedding dim, and concat with a CLS token - action_embed = self.encoder_action_proj(actions) # (bs, seq, hidden_dim) - qpos_embed = self.encoder_joint_proj(qpos) # (bs, hidden_dim) - qpos_embed = torch.unsqueeze(qpos_embed, axis=1) # (bs, 1, hidden_dim) - cls_embed = self.cls_embed.weight # (1, hidden_dim) - cls_embed = torch.unsqueeze(cls_embed, axis=0).repeat(bs, 1, 1) # (bs, 1, hidden_dim) - encoder_input = torch.cat([cls_embed, qpos_embed, action_embed], axis=1) # (bs, seq+1, hidden_dim) - encoder_input = encoder_input.permute(1, 0, 2) # (seq+1, bs, hidden_dim) - # do not mask cls token - cls_joint_is_pad = torch.full((bs, 2), False).to(qpos.device) # False: not a padding - is_pad = torch.cat([cls_joint_is_pad, is_pad], axis=1) # (bs, seq+1) - # obtain position embedding - pos_embed = self.pos_table.clone().detach() - pos_embed = pos_embed.permute(1, 0, 2) # (seq+1, 1, hidden_dim) - # query model - encoder_output = self.encoder(encoder_input, pos=pos_embed, src_key_padding_mask=is_pad) - encoder_output = encoder_output[0] # take cls output only - latent_info = self.latent_proj(encoder_output) - mu = latent_info[:, :self.latent_dim] - logvar = latent_info[:, self.latent_dim:] - latent_sample = reparametrize(mu, logvar) - latent_input = self.latent_out_proj(latent_sample) - else: - mu = logvar = None - latent_sample = torch.zeros([bs, self.latent_dim], dtype=torch.float32).to(qpos.device) - latent_input = self.latent_out_proj(latent_sample) - - if self.backbones is not None: - # Image observation features and position embeddings - all_cam_features = [] - all_cam_pos = [] - - - - - # print(f"Image shape: {image.shape}, Number of cameras: {len(self.camera_names)}") - - - for cam_id, cam_name in enumerate(self.camera_names): - # features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED - features, pos = self.backbones[cam_id](image[:, cam_id]) - features = features[0] # take the last layer feature - pos = pos[0] - all_cam_features.append(self.input_proj(features)) - all_cam_pos.append(pos) - - - - - - - - - - - - # proprioception features - proprio_input = self.input_proj_robot_state(qpos) - # fold camera dimension into width dimension - src = torch.cat(all_cam_features, axis=3) - pos = torch.cat(all_cam_pos, axis=3) - hs = self.transformer(src, None, self.query_embed.weight, pos, latent_input, proprio_input, self.additional_pos_embed.weight)[0] - else: - qpos = self.input_proj_robot_state(qpos) - env_state = self.input_proj_env_state(env_state) - transformer_input = torch.cat([qpos, env_state], axis=1) # seq length = 2 - hs = self.transformer(transformer_input, None, self.query_embed.weight, self.pos.weight)[0] - a_hat = self.action_head(hs) - is_pad_hat = self.is_pad_head(hs) - return a_hat, is_pad_hat, [mu, logvar] - - - -class CNNMLP(nn.Module): - def __init__(self, backbones, state_dim, camera_names): - """ Initializes the model. - Parameters: - backbones: torch module of the backbone to be used. See backbone.py - transformer: torch module of the transformer architecture. See transformer.py - state_dim: robot state dimension of the environment - num_queries: number of object queries, ie detection slot. This is the maximal number of objects - DETR can detect in a single image. For COCO, we recommend 100 queries. - aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. - """ - super().__init__() - self.camera_names = camera_names - self.action_head = nn.Linear(1000, state_dim) # TODO add more - if backbones is not None: - self.backbones = nn.ModuleList(backbones) - backbone_down_projs = [] - for backbone in backbones: - down_proj = nn.Sequential( - nn.Conv2d(backbone.num_channels, 128, kernel_size=5), - nn.Conv2d(128, 64, kernel_size=5), - nn.Conv2d(64, 32, kernel_size=5) - ) - backbone_down_projs.append(down_proj) - self.backbone_down_projs = nn.ModuleList(backbone_down_projs) - - mlp_in_dim = 768 * len(backbones) + 14 - self.mlp = mlp(input_dim=mlp_in_dim, hidden_dim=1024, output_dim=14, hidden_depth=2) - else: - raise NotImplementedError - - def forward(self, qpos, image, env_state, actions=None): - """ - qpos: batch, qpos_dim - image: batch, num_cam, channel, height, width - env_state: None - actions: batch, seq, action_dim - """ - is_training = actions is not None # train or val - bs, _ = qpos.shape - # Image observation features and position embeddings - all_cam_features = [] - for cam_id, cam_name in enumerate(self.camera_names): - features, pos = self.backbones[cam_id](image[:, cam_id]) - features = features[0] # take the last layer feature - pos = pos[0] # not used - all_cam_features.append(self.backbone_down_projs[cam_id](features)) - # flatten everything - flattened_features = [] - for cam_feature in all_cam_features: - flattened_features.append(cam_feature.reshape([bs, -1])) - flattened_features = torch.cat(flattened_features, axis=1) # 768 each - features = torch.cat([flattened_features, qpos], axis=1) # qpos: 14 - a_hat = self.mlp(features) - return a_hat - - -def mlp(input_dim, hidden_dim, output_dim, hidden_depth): - if hidden_depth == 0: - mods = [nn.Linear(input_dim, output_dim)] - else: - mods = [nn.Linear(input_dim, hidden_dim), nn.ReLU(inplace=True)] - for i in range(hidden_depth - 1): - mods += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU(inplace=True)] - mods.append(nn.Linear(hidden_dim, output_dim)) - trunk = nn.Sequential(*mods) - return trunk - - -def build_encoder(args): - d_model = args.hidden_dim # 256 - dropout = args.dropout # 0.1 - nhead = args.nheads # 8 - dim_feedforward = args.dim_feedforward # 2048 - num_encoder_layers = args.enc_layers # 4 # TODO shared with VAE decoder - normalize_before = args.pre_norm # False - activation = "relu" - - encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, - dropout, activation, normalize_before) - encoder_norm = nn.LayerNorm(d_model) if normalize_before else None - encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) - - return encoder - - -def build(args): - state_dim = args.state_dim - action_dim = args.action_dim - - # From state - # backbone = None # from state for now, no need for conv nets - # From image - backbones = [] - # backbone = build_backbone(args) - # backbones.append(backbone) - for _ in args.camera_names: - backbone = build_backbone(args) - backbones.append(backbone) - - transformer = build_transformer(args) - - encoder = build_encoder(args) - - model = DETRVAE( - backbones, - transformer, - encoder, - state_dim=state_dim, - action_dim=action_dim, - num_queries=args.num_queries, - camera_names=args.camera_names, - ) - - n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) - print("number of parameters: %.2fM" % (n_parameters/1e6,)) - - return model - -def build_cnnmlp(args): - state_dim = 14 # TODO hardcode - - # From state - # backbone = None # from state for now, no need for conv nets - # From image - backbones = [] - for _ in args.camera_names: - backbone = build_backbone(args) - backbones.append(backbone) - - model = CNNMLP( - backbones, - state_dim=state_dim, - camera_names=args.camera_names, - ) - - n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) - print("number of parameters: %.2fM" % (n_parameters/1e6,)) - - return model - diff --git a/roboimi/detr/models/position_encoding.py b/roboimi/detr/models/position_encoding.py deleted file mode 100644 index c75733e..0000000 --- a/roboimi/detr/models/position_encoding.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -""" -Various positional encodings for the transformer. -""" -import math -import torch -from torch import nn - -from util.misc import NestedTensor - - -class PositionEmbeddingSine(nn.Module): - """ - This is a more standard version of the position embedding, very similar to the one - used by the Attention is all you need paper, generalized to work on images. - """ - def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): - super().__init__() - self.num_pos_feats = num_pos_feats - self.temperature = temperature - self.normalize = normalize - if scale is not None and normalize is False: - raise ValueError("normalize should be True if scale is passed") - if scale is None: - scale = 2 * math.pi - self.scale = scale - - def forward(self, tensor): - x = tensor - # mask = tensor_list.mask - # assert mask is not None - # not_mask = ~mask - - not_mask = torch.ones_like(x[0, [0]]) - y_embed = not_mask.cumsum(1, dtype=torch.float32) - x_embed = not_mask.cumsum(2, dtype=torch.float32) - if self.normalize: - eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale - - dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) - dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) - - pos_x = x_embed[:, :, :, None] / dim_t - pos_y = y_embed[:, :, :, None] / dim_t - pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) - return pos - - -class PositionEmbeddingLearned(nn.Module): - """ - Absolute pos embedding, learned. - """ - def __init__(self, num_pos_feats=256): - super().__init__() - self.row_embed = nn.Embedding(50, num_pos_feats) - self.col_embed = nn.Embedding(50, num_pos_feats) - self.reset_parameters() - - def reset_parameters(self): - nn.init.uniform_(self.row_embed.weight) - nn.init.uniform_(self.col_embed.weight) - - def forward(self, tensor_list: NestedTensor): - x = tensor_list.tensors - h, w = x.shape[-2:] - i = torch.arange(w, device=x.device) - j = torch.arange(h, device=x.device) - x_emb = self.col_embed(i) - y_emb = self.row_embed(j) - pos = torch.cat([ - x_emb.unsqueeze(0).repeat(h, 1, 1), - y_emb.unsqueeze(1).repeat(1, w, 1), - ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) - return pos - - -def build_position_encoding(args): - N_steps = args.hidden_dim // 2 - if args.position_embedding in ('v2', 'sine'): - # TODO find a better way of exposing other arguments - position_embedding = PositionEmbeddingSine(N_steps, normalize=True) - elif args.position_embedding in ('v3', 'learned'): - position_embedding = PositionEmbeddingLearned(N_steps) - else: - raise ValueError(f"not supported {args.position_embedding}") - - return position_embedding diff --git a/roboimi/detr/models/transformer.py b/roboimi/detr/models/transformer.py deleted file mode 100644 index 2306ab2..0000000 --- a/roboimi/detr/models/transformer.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -""" -DETR Transformer class. - -Copy-paste from torch.nn.Transformer with modifications: - * positional encodings are passed in MHattention - * extra LN at the end of encoder is removed - * decoder returns a stack of activations from all decoding layers -""" -import copy -from typing import Optional, List - -import torch -import torch.nn.functional as F -from torch import nn, Tensor - - -class Transformer(nn.Module): - - def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, - num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, - activation="relu", normalize_before=False, - return_intermediate_dec=False): - super().__init__() - - encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, - dropout, activation, normalize_before) - encoder_norm = nn.LayerNorm(d_model) if normalize_before else None - self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) - - decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, - dropout, activation, normalize_before) - decoder_norm = nn.LayerNorm(d_model) - self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, - return_intermediate=return_intermediate_dec) - - self._reset_parameters() - - self.d_model = d_model - self.nhead = nhead - - def _reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - - def forward(self, src, mask, query_embed, pos_embed, latent_input=None, proprio_input=None, additional_pos_embed=None): - # TODO flatten only when input has H and W - if len(src.shape) == 4: # has H and W - # flatten NxCxHxW to HWxNxC - bs, c, h, w = src.shape - src = src.flatten(2).permute(2, 0, 1) - pos_embed = pos_embed.flatten(2).permute(2, 0, 1).repeat(1, bs, 1) - query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) - # mask = mask.flatten(1) - - additional_pos_embed = additional_pos_embed.unsqueeze(1).repeat(1, bs, 1) # seq, bs, dim - pos_embed = torch.cat([additional_pos_embed, pos_embed], axis=0) - - addition_input = torch.stack([latent_input, proprio_input], axis=0) - src = torch.cat([addition_input, src], axis=0) - else: - assert len(src.shape) == 3 - # flatten NxHWxC to HWxNxC - bs, hw, c = src.shape - src = src.permute(1, 0, 2) - pos_embed = pos_embed.unsqueeze(1).repeat(1, bs, 1) - query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) - - tgt = torch.zeros_like(query_embed) - memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) - hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, - pos=pos_embed, query_pos=query_embed) - hs = hs.transpose(1, 2) - return hs - -class TransformerEncoder(nn.Module): - - def __init__(self, encoder_layer, num_layers, norm=None): - super().__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward(self, src, - mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None): - output = src - - for layer in self.layers: - output = layer(output, src_mask=mask, - src_key_padding_mask=src_key_padding_mask, pos=pos) - - if self.norm is not None: - output = self.norm(output) - - return output - - -class TransformerDecoder(nn.Module): - - def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): - super().__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - self.return_intermediate = return_intermediate - - def forward(self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None): - output = tgt - - intermediate = [] - - for layer in self.layers: - output = layer(output, memory, tgt_mask=tgt_mask, - memory_mask=memory_mask, - tgt_key_padding_mask=tgt_key_padding_mask, - memory_key_padding_mask=memory_key_padding_mask, - pos=pos, query_pos=query_pos) - if self.return_intermediate: - intermediate.append(self.norm(output)) - - if self.norm is not None: - output = self.norm(output) - if self.return_intermediate: - intermediate.pop() - intermediate.append(output) - - if self.return_intermediate: - return torch.stack(intermediate) - - return output.unsqueeze(0) - - -class TransformerEncoderLayer(nn.Module): - - def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, - activation="relu", normalize_before=False): - super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - - self.activation = _get_activation_fn(activation) - self.normalize_before = normalize_before - - def with_pos_embed(self, tensor, pos: Optional[Tensor]): - return tensor if pos is None else tensor + pos - - def forward_post(self, - src, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None): - q = k = self.with_pos_embed(src, pos) - src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, - key_padding_mask=src_key_padding_mask)[0] - src = src + self.dropout1(src2) - src = self.norm1(src) - src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = src + self.dropout2(src2) - src = self.norm2(src) - return src - - def forward_pre(self, src, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None): - src2 = self.norm1(src) - q = k = self.with_pos_embed(src2, pos) - src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, - key_padding_mask=src_key_padding_mask)[0] - src = src + self.dropout1(src2) - src2 = self.norm2(src) - src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) - src = src + self.dropout2(src2) - return src - - def forward(self, src, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None): - if self.normalize_before: - return self.forward_pre(src, src_mask, src_key_padding_mask, pos) - return self.forward_post(src, src_mask, src_key_padding_mask, pos) - - -class TransformerDecoderLayer(nn.Module): - - def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, - activation="relu", normalize_before=False): - super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.norm3 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - self.dropout3 = nn.Dropout(dropout) - - self.activation = _get_activation_fn(activation) - self.normalize_before = normalize_before - - def with_pos_embed(self, tensor, pos: Optional[Tensor]): - return tensor if pos is None else tensor + pos - - def forward_post(self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None): - q = k = self.with_pos_embed(tgt, query_pos) - tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, - key_padding_mask=tgt_key_padding_mask)[0] - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm1(tgt) - tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), - key=self.with_pos_embed(memory, pos), - value=memory, attn_mask=memory_mask, - key_padding_mask=memory_key_padding_mask)[0] - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm2(tgt) - tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) - tgt = tgt + self.dropout3(tgt2) - tgt = self.norm3(tgt) - return tgt - - def forward_pre(self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None): - tgt2 = self.norm1(tgt) - q = k = self.with_pos_embed(tgt2, query_pos) - tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, - key_padding_mask=tgt_key_padding_mask)[0] - tgt = tgt + self.dropout1(tgt2) - tgt2 = self.norm2(tgt) - tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), - key=self.with_pos_embed(memory, pos), - value=memory, attn_mask=memory_mask, - key_padding_mask=memory_key_padding_mask)[0] - tgt = tgt + self.dropout2(tgt2) - tgt2 = self.norm3(tgt) - tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) - tgt = tgt + self.dropout3(tgt2) - return tgt - - def forward(self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None): - if self.normalize_before: - return self.forward_pre(tgt, memory, tgt_mask, memory_mask, - tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) - return self.forward_post(tgt, memory, tgt_mask, memory_mask, - tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) - - -def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) - - -def build_transformer(args): - return Transformer( - d_model=args.hidden_dim, - dropout=args.dropout, - nhead=args.nheads, - dim_feedforward=args.dim_feedforward, - num_encoder_layers=args.enc_layers, - num_decoder_layers=args.dec_layers, - normalize_before=args.pre_norm, - return_intermediate_dec=True, - ) - - -def _get_activation_fn(activation): - """Return an activation function given a string""" - if activation == "relu": - return F.relu - if activation == "gelu": - return F.gelu - if activation == "glu": - return F.glu - raise RuntimeError(F"activation should be relu/gelu, not {activation}.") diff --git a/roboimi/detr/policy.py b/roboimi/detr/policy.py deleted file mode 100644 index 20ac4c0..0000000 --- a/roboimi/detr/policy.py +++ /dev/null @@ -1,163 +0,0 @@ -import torch.nn as nn -from torch.nn import functional as F -import torchvision.transforms as transforms -from torchvision.transforms import v2 -import torch -from roboimi.detr.main import build_ACT_model_and_optimizer, build_CNNMLP_model_and_optimizer - - -class ACTPolicy(nn.Module): - def __init__(self, args_override): - super().__init__() - model, optimizer = build_ACT_model_and_optimizer(args_override) - self.model = model # CVAE decoder - self.optimizer = optimizer - self.kl_weight = args_override['kl_weight'] - print(f'KL Weight {self.kl_weight}') - - def __call__(self, qpos, image, actions=None, is_pad=None): - env_state = None - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - image = normalize(image) - if actions is not None: # training time - actions = actions[:, :self.model.num_queries] - is_pad = is_pad[:, :self.model.num_queries] - - a_hat, is_pad_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad) - total_kld, dim_wise_kld, mean_kld = kl_divergence(mu, logvar) - loss_dict = dict() - all_l1 = F.l1_loss(actions, a_hat, reduction='none') - l1 = (all_l1 * ~is_pad.unsqueeze(-1)).mean() - loss_dict['l1'] = l1 - loss_dict['kl'] = total_kld[0] - loss_dict['loss'] = loss_dict['l1'] + loss_dict['kl'] * self.kl_weight - return loss_dict - else: # inference time - a_hat, _, (_, _) = self.model(qpos, image, env_state) # no action, sample from prior - return a_hat - - def configure_optimizers(self): - return self.optimizer - -class ACTTVPolicy(nn.Module): - def __init__(self, args_override): - super().__init__() - model, optimizer = build_ACT_model_and_optimizer(args_override) - self.model = model # CVAE decoder - self.optimizer = optimizer - self.kl_weight = args_override['kl_weight'] - self.qpos_noise_std = args_override['qpos_noise_std'] - print(f'KL Weight {self.kl_weight}') - - def __call__(self, qpos, image, actions=None, is_pad=None): - env_state = None - - - - - - - - - - # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - # std=[0.229, 0.224, 0.225]) - # image = normalize(image) - - - patch_h = 16 - patch_w = 22 - if actions is not None: - transform = v2.Compose([ - v2.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5), - v2.RandomPerspective(distortion_scale=0.5), - v2.RandomAffine(degrees=10, translate=(0.1,0.1), scale=(0.9,1.1)), - v2.GaussianBlur(kernel_size=(9,9), sigma=(0.1,2.0)), - v2.Resize((patch_h * 14, patch_w * 14)), - # v2.CenterCrop((patch_h * 14, patch_w * 14)), - v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), - ]) - qpos += (self.qpos_noise_std**0.5)*torch.randn_like(qpos) - else: # inference time - transform = v2.Compose([ - v2.Resize((patch_h * 14, patch_w * 14)), - # v2.CenterCrop((patch_h * 14, patch_w * 14)), - v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) - ]) - - image = transform(image) - - - - - - - - - - - - - if actions is not None: # training time - actions = actions[:, :self.model.num_queries] - is_pad = is_pad[:, :self.model.num_queries] - - a_hat, is_pad_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad) - total_kld, dim_wise_kld, mean_kld = kl_divergence(mu, logvar) - loss_dict = dict() - all_l1 = F.l1_loss(actions, a_hat, reduction='none') - l1 = (all_l1 * ~is_pad.unsqueeze(-1)).mean() - loss_dict['l1'] = l1 - loss_dict['kl'] = total_kld[0] - loss_dict['loss'] = loss_dict['l1'] + loss_dict['kl'] * self.kl_weight - return loss_dict - else: # inference time - a_hat, _, (_, _) = self.model(qpos, image, env_state) # no action, sample from prior - return a_hat - - def configure_optimizers(self): - return self.optimizer - - -class CNNMLPPolicy(nn.Module): - def __init__(self, args_override): - super().__init__() - model, optimizer = build_CNNMLP_model_and_optimizer(args_override) - self.model = model # decoder - self.optimizer = optimizer - - def __call__(self, qpos, image, actions=None, is_pad=None): - env_state = None # TODO - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - image = normalize(image) - if actions is not None: # training time - actions = actions[:, 0] - a_hat = self.model(qpos, image, env_state, actions) - mse = F.mse_loss(actions, a_hat) - loss_dict = dict() - loss_dict['mse'] = mse - loss_dict['loss'] = loss_dict['mse'] - return loss_dict - else: # inference time - a_hat = self.model(qpos, image, env_state) # no action, sample from prior - return a_hat - - def configure_optimizers(self): - return self.optimizer - -def kl_divergence(mu, logvar): - batch_size = mu.size(0) - assert batch_size != 0 - if mu.data.ndimension() == 4: - mu = mu.view(mu.size(0), mu.size(1)) - if logvar.data.ndimension() == 4: - logvar = logvar.view(logvar.size(0), logvar.size(1)) - - klds = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()) - total_kld = klds.sum(1).mean(0, True) - dimension_wise_kld = klds.mean(0) - mean_kld = klds.mean(1).mean(0, True) - - return total_kld, dimension_wise_kld, mean_kld diff --git a/roboimi/detr/setup.py b/roboimi/detr/setup.py deleted file mode 100644 index 55d18c0..0000000 --- a/roboimi/detr/setup.py +++ /dev/null @@ -1,10 +0,0 @@ -from distutils.core import setup -from setuptools import find_packages - -setup( - name='detr', - version='0.0.0', - packages=find_packages(), - license='MIT License', - long_description=open('README.md').read(), -) \ No newline at end of file diff --git a/roboimi/detr/util/__init__.py b/roboimi/detr/util/__init__.py deleted file mode 100644 index 168f997..0000000 --- a/roboimi/detr/util/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved diff --git a/roboimi/detr/util/box_ops.py b/roboimi/detr/util/box_ops.py deleted file mode 100644 index 9c088e5..0000000 --- a/roboimi/detr/util/box_ops.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -""" -Utilities for bounding box manipulation and GIoU. -""" -import torch -from torchvision.ops.boxes import box_area - - -def box_cxcywh_to_xyxy(x): - x_c, y_c, w, h = x.unbind(-1) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), - (x_c + 0.5 * w), (y_c + 0.5 * h)] - return torch.stack(b, dim=-1) - - -def box_xyxy_to_cxcywh(x): - x0, y0, x1, y1 = x.unbind(-1) - b = [(x0 + x1) / 2, (y0 + y1) / 2, - (x1 - x0), (y1 - y0)] - return torch.stack(b, dim=-1) - - -# modified from torchvision to also return the union -def box_iou(boxes1, boxes2): - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] - rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] - - wh = (rb - lt).clamp(min=0) # [N,M,2] - inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] - - union = area1[:, None] + area2 - inter - - iou = inter / union - return iou, union - - -def generalized_box_iou(boxes1, boxes2): - """ - Generalized IoU from https://giou.stanford.edu/ - - The boxes should be in [x0, y0, x1, y1] format - - Returns a [N, M] pairwise matrix, where N = len(boxes1) - and M = len(boxes2) - """ - # degenerate boxes gives inf / nan results - # so do an early check - assert (boxes1[:, 2:] >= boxes1[:, :2]).all() - assert (boxes2[:, 2:] >= boxes2[:, :2]).all() - iou, union = box_iou(boxes1, boxes2) - - lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) - rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) - - wh = (rb - lt).clamp(min=0) # [N,M,2] - area = wh[:, :, 0] * wh[:, :, 1] - - return iou - (area - union) / area - - -def masks_to_boxes(masks): - """Compute the bounding boxes around the provided masks - - The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. - - Returns a [N, 4] tensors, with the boxes in xyxy format - """ - if masks.numel() == 0: - return torch.zeros((0, 4), device=masks.device) - - h, w = masks.shape[-2:] - - y = torch.arange(0, h, dtype=torch.float) - x = torch.arange(0, w, dtype=torch.float) - y, x = torch.meshgrid(y, x) - - x_mask = (masks * x.unsqueeze(0)) - x_max = x_mask.flatten(1).max(-1)[0] - x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] - - y_mask = (masks * y.unsqueeze(0)) - y_max = y_mask.flatten(1).max(-1)[0] - y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] - - return torch.stack([x_min, y_min, x_max, y_max], 1) diff --git a/roboimi/detr/util/misc.py b/roboimi/detr/util/misc.py deleted file mode 100644 index dfa9fb5..0000000 --- a/roboimi/detr/util/misc.py +++ /dev/null @@ -1,468 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -""" -Misc functions, including distributed helpers. - -Mostly copy-paste from torchvision references. -""" -import os -import subprocess -import time -from collections import defaultdict, deque -import datetime -import pickle -from packaging import version -from typing import Optional, List - -import torch -import torch.distributed as dist -from torch import Tensor - -# needed due to empty tensor bug in pytorch and torchvision 0.5 -import torchvision -if version.parse(torchvision.__version__) < version.parse('0.7'): - from torchvision.ops import _new_empty_tensor - from torchvision.ops.misc import _output_size - - -class SmoothedValue(object): - """Track a series of values and provide access to smoothed values over a - window or the global series average. - """ - - def __init__(self, window_size=20, fmt=None): - if fmt is None: - fmt = "{median:.4f} ({global_avg:.4f})" - self.deque = deque(maxlen=window_size) - self.total = 0.0 - self.count = 0 - self.fmt = fmt - - def update(self, value, n=1): - self.deque.append(value) - self.count += n - self.total += value * n - - def synchronize_between_processes(self): - """ - Warning: does not synchronize the deque! - """ - if not is_dist_avail_and_initialized(): - return - t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') - dist.barrier() - dist.all_reduce(t) - t = t.tolist() - self.count = int(t[0]) - self.total = t[1] - - @property - def median(self): - d = torch.tensor(list(self.deque)) - return d.median().item() - - @property - def avg(self): - d = torch.tensor(list(self.deque), dtype=torch.float32) - return d.mean().item() - - @property - def global_avg(self): - return self.total / self.count - - @property - def max(self): - return max(self.deque) - - @property - def value(self): - return self.deque[-1] - - def __str__(self): - return self.fmt.format( - median=self.median, - avg=self.avg, - global_avg=self.global_avg, - max=self.max, - value=self.value) - - -def all_gather(data): - """ - Run all_gather on arbitrary picklable data (not necessarily tensors) - Args: - data: any picklable object - Returns: - list[data]: list of data gathered from each rank - """ - world_size = get_world_size() - if world_size == 1: - return [data] - - # serialized to a Tensor - buffer = pickle.dumps(data) - storage = torch.ByteStorage.from_buffer(buffer) - tensor = torch.ByteTensor(storage).to("cuda") - - # obtain Tensor size of each rank - local_size = torch.tensor([tensor.numel()], device="cuda") - size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] - dist.all_gather(size_list, local_size) - size_list = [int(size.item()) for size in size_list] - max_size = max(size_list) - - # receiving Tensor from all ranks - # we pad the tensor because torch all_gather does not support - # gathering tensors of different shapes - tensor_list = [] - for _ in size_list: - tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) - if local_size != max_size: - padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") - tensor = torch.cat((tensor, padding), dim=0) - dist.all_gather(tensor_list, tensor) - - data_list = [] - for size, tensor in zip(size_list, tensor_list): - buffer = tensor.cpu().numpy().tobytes()[:size] - data_list.append(pickle.loads(buffer)) - - return data_list - - -def reduce_dict(input_dict, average=True): - """ - Args: - input_dict (dict): all the values will be reduced - average (bool): whether to do average or sum - Reduce the values in the dictionary from all processes so that all processes - have the averaged results. Returns a dict with the same fields as - input_dict, after reduction. - """ - world_size = get_world_size() - if world_size < 2: - return input_dict - with torch.no_grad(): - names = [] - values = [] - # sort the keys so that they are consistent across processes - for k in sorted(input_dict.keys()): - names.append(k) - values.append(input_dict[k]) - values = torch.stack(values, dim=0) - dist.all_reduce(values) - if average: - values /= world_size - reduced_dict = {k: v for k, v in zip(names, values)} - return reduced_dict - - -class MetricLogger(object): - def __init__(self, delimiter="\t"): - self.meters = defaultdict(SmoothedValue) - self.delimiter = delimiter - - def update(self, **kwargs): - for k, v in kwargs.items(): - if isinstance(v, torch.Tensor): - v = v.item() - assert isinstance(v, (float, int)) - self.meters[k].update(v) - - def __getattr__(self, attr): - if attr in self.meters: - return self.meters[attr] - if attr in self.__dict__: - return self.__dict__[attr] - raise AttributeError("'{}' object has no attribute '{}'".format( - type(self).__name__, attr)) - - def __str__(self): - loss_str = [] - for name, meter in self.meters.items(): - loss_str.append( - "{}: {}".format(name, str(meter)) - ) - return self.delimiter.join(loss_str) - - def synchronize_between_processes(self): - for meter in self.meters.values(): - meter.synchronize_between_processes() - - def add_meter(self, name, meter): - self.meters[name] = meter - - def log_every(self, iterable, print_freq, header=None): - i = 0 - if not header: - header = '' - start_time = time.time() - end = time.time() - iter_time = SmoothedValue(fmt='{avg:.4f}') - data_time = SmoothedValue(fmt='{avg:.4f}') - space_fmt = ':' + str(len(str(len(iterable)))) + 'd' - if torch.cuda.is_available(): - log_msg = self.delimiter.join([ - header, - '[{0' + space_fmt + '}/{1}]', - 'eta: {eta}', - '{meters}', - 'time: {time}', - 'data: {data}', - 'max mem: {memory:.0f}' - ]) - else: - log_msg = self.delimiter.join([ - header, - '[{0' + space_fmt + '}/{1}]', - 'eta: {eta}', - '{meters}', - 'time: {time}', - 'data: {data}' - ]) - MB = 1024.0 * 1024.0 - for obj in iterable: - data_time.update(time.time() - end) - yield obj - iter_time.update(time.time() - end) - if i % print_freq == 0 or i == len(iterable) - 1: - eta_seconds = iter_time.global_avg * (len(iterable) - i) - eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) - if torch.cuda.is_available(): - print(log_msg.format( - i, len(iterable), eta=eta_string, - meters=str(self), - time=str(iter_time), data=str(data_time), - memory=torch.cuda.max_memory_allocated() / MB)) - else: - print(log_msg.format( - i, len(iterable), eta=eta_string, - meters=str(self), - time=str(iter_time), data=str(data_time))) - i += 1 - end = time.time() - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print('{} Total time: {} ({:.4f} s / it)'.format( - header, total_time_str, total_time / len(iterable))) - - -def get_sha(): - cwd = os.path.dirname(os.path.abspath(__file__)) - - def _run(command): - return subprocess.check_output(command, cwd=cwd).decode('ascii').strip() - sha = 'N/A' - diff = "clean" - branch = 'N/A' - try: - sha = _run(['git', 'rev-parse', 'HEAD']) - subprocess.check_output(['git', 'diff'], cwd=cwd) - diff = _run(['git', 'diff-index', 'HEAD']) - diff = "has uncommited changes" if diff else "clean" - branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD']) - except Exception: - pass - message = f"sha: {sha}, status: {diff}, branch: {branch}" - return message - - -def collate_fn(batch): - batch = list(zip(*batch)) - batch[0] = nested_tensor_from_tensor_list(batch[0]) - return tuple(batch) - - -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -class NestedTensor(object): - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - # type: (Device) -> NestedTensor # noqa - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - assert mask is not None - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - # TODO make this more general - if tensor_list[0].ndim == 3: - if torchvision._is_tracing(): - # nested_tensor_from_tensor_list() does not export well to ONNX - # call _onnx_nested_tensor_from_tensor_list() instead - return _onnx_nested_tensor_from_tensor_list(tensor_list) - - # TODO make it support different-sized images - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) - batch_shape = [len(tensor_list)] + max_size - b, c, h, w = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((b, h, w), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], :img.shape[2]] = False - else: - raise ValueError('not supported') - return NestedTensor(tensor, mask) - - -# _onnx_nested_tensor_from_tensor_list() is an implementation of -# nested_tensor_from_tensor_list() that is supported by ONNX tracing. -@torch.jit.unused -def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: - max_size = [] - for i in range(tensor_list[0].dim()): - max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64) - max_size.append(max_size_i) - max_size = tuple(max_size) - - # work around for - # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - # m[: img.shape[1], :img.shape[2]] = False - # which is not yet supported in onnx - padded_imgs = [] - padded_masks = [] - for img in tensor_list: - padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] - padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) - padded_imgs.append(padded_img) - - m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) - padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) - padded_masks.append(padded_mask.to(torch.bool)) - - tensor = torch.stack(padded_imgs) - mask = torch.stack(padded_masks) - - return NestedTensor(tensor, mask=mask) - - -def setup_for_distributed(is_master): - """ - This function disables printing when not in master process - """ - import builtins as __builtin__ - builtin_print = __builtin__.print - - def print(*args, **kwargs): - force = kwargs.pop('force', False) - if is_master or force: - builtin_print(*args, **kwargs) - - __builtin__.print = print - - -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - - -def get_world_size(): - if not is_dist_avail_and_initialized(): - return 1 - return dist.get_world_size() - - -def get_rank(): - if not is_dist_avail_and_initialized(): - return 0 - return dist.get_rank() - - -def is_main_process(): - return get_rank() == 0 - - -def save_on_master(*args, **kwargs): - if is_main_process(): - torch.save(*args, **kwargs) - - -def init_distributed_mode(args): - if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: - args.rank = int(os.environ["RANK"]) - args.world_size = int(os.environ['WORLD_SIZE']) - args.gpu = int(os.environ['LOCAL_RANK']) - elif 'SLURM_PROCID' in os.environ: - args.rank = int(os.environ['SLURM_PROCID']) - args.gpu = args.rank % torch.cuda.device_count() - else: - print('Not using distributed mode') - args.distributed = False - return - - args.distributed = True - - torch.cuda.set_device(args.gpu) - args.dist_backend = 'nccl' - print('| distributed init (rank {}): {}'.format( - args.rank, args.dist_url), flush=True) - torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - torch.distributed.barrier() - setup_for_distributed(args.rank == 0) - - -@torch.no_grad() -def accuracy(output, target, topk=(1,)): - """Computes the precision@k for the specified values of k""" - if target.numel() == 0: - return [torch.zeros([], device=output.device)] - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): - # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor - """ - Equivalent to nn.functional.interpolate, but with support for empty batch sizes. - This will eventually be supported natively by PyTorch, and this - class can go away. - """ - if version.parse(torchvision.__version__) < version.parse('0.7'): - if input.numel() > 0: - return torch.nn.functional.interpolate( - input, size, scale_factor, mode, align_corners - ) - - output_shape = _output_size(2, input, size, scale_factor) - output_shape = list(input.shape[:-2]) + list(output_shape) - return _new_empty_tensor(input, output_shape) - else: - return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/roboimi/detr/util/plot_utils.py b/roboimi/detr/util/plot_utils.py deleted file mode 100644 index 0f24bed..0000000 --- a/roboimi/detr/util/plot_utils.py +++ /dev/null @@ -1,107 +0,0 @@ -""" -Plotting utilities to visualize training logs. -""" -import torch -import pandas as pd -import numpy as np -import seaborn as sns -import matplotlib.pyplot as plt - -from pathlib import Path, PurePath - - -def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): - ''' - Function to plot specific fields from training log(s). Plots both training and test results. - - :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file - - fields = which results to plot from each log file - plots both training and test for each field. - - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots - - log_name = optional, name of log file if different than default 'log.txt'. - - :: Outputs - matplotlib plots of results in fields, color coded for each log file. - - solid lines are training results, dashed lines are test results. - - ''' - func_name = "plot_utils.py::plot_logs" - - # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, - # convert single Path to list to avoid 'not iterable' error - - if not isinstance(logs, list): - if isinstance(logs, PurePath): - logs = [logs] - print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") - else: - raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ - Expect list[Path] or single Path obj, received {type(logs)}") - - # Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir - for i, dir in enumerate(logs): - if not isinstance(dir, PurePath): - raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") - if not dir.exists(): - raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") - # verify log_name exists - fn = Path(dir / log_name) - if not fn.exists(): - print(f"-> missing {log_name}. Have you gotten to Epoch 1 in training?") - print(f"--> full path of missing log file: {fn}") - return - - # load log file(s) and plot - dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] - - fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) - - for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): - for j, field in enumerate(fields): - if field == 'mAP': - coco_eval = pd.DataFrame( - np.stack(df.test_coco_eval_bbox.dropna().values)[:, 1] - ).ewm(com=ewm_col).mean() - axs[j].plot(coco_eval, c=color) - else: - df.interpolate().ewm(com=ewm_col).mean().plot( - y=[f'train_{field}', f'test_{field}'], - ax=axs[j], - color=[color] * 2, - style=['-', '--'] - ) - for ax, field in zip(axs, fields): - ax.legend([Path(p).name for p in logs]) - ax.set_title(field) - - -def plot_precision_recall(files, naming_scheme='iter'): - if naming_scheme == 'exp_id': - # name becomes exp_id - names = [f.parts[-3] for f in files] - elif naming_scheme == 'iter': - names = [f.stem for f in files] - else: - raise ValueError(f'not supported {naming_scheme}') - fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) - for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): - data = torch.load(f) - # precision is n_iou, n_points, n_cat, n_area, max_det - precision = data['precision'] - recall = data['params'].recThrs - scores = data['scores'] - # take precision for all classes, all areas and 100 detections - precision = precision[0, :, :, 0, -1].mean(1) - scores = scores[0, :, :, 0, -1].mean(1) - prec = precision.mean() - rec = data['recall'][0, :, 0, -1].mean() - print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + - f'score={scores.mean():0.3f}, ' + - f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' - ) - axs[0].plot(recall, precision, c=color) - axs[1].plot(recall, scores, c=color) - - axs[0].set_title('Precision / Recall') - axs[0].legend(names) - axs[1].set_title('Scores / Recall') - axs[1].legend(names) - return fig, axs diff --git a/roboimi/gr00t/main.py b/roboimi/gr00t/main.py deleted file mode 100644 index c56b359..0000000 --- a/roboimi/gr00t/main.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -""" -GR00T (diffusion-based DiT policy) model builder. - -This module provides functions to build GR00T models and optimizers -from configuration dictionaries (typically from config.yaml's 'gr00t:' section). -""" -import argparse -from pathlib import Path - -import numpy as np -import torch -from .models import build_gr00t_model - - -def get_args_parser(): - """ - Create argument parser for GR00T model configuration. - - All parameters can be overridden via args_override dictionary in - build_gr00t_model_and_optimizer(). This allows loading from config.yaml. - """ - parser = argparse.ArgumentParser('GR00T training and evaluation script', add_help=False) - - # Training parameters - parser.add_argument('--lr', default=1e-5, type=float, - help='Learning rate for main parameters') - parser.add_argument('--lr_backbone', default=1e-5, type=float, - help='Learning rate for backbone parameters') - parser.add_argument('--weight_decay', default=1e-4, type=float, - help='Weight decay for optimizer') - - # GR00T model architecture parameters - parser.add_argument('--embed_dim', default=1536, type=int, - help='Embedding dimension for transformer') - parser.add_argument('--hidden_dim', default=1024, type=int, - help='Hidden dimension for MLP layers') - parser.add_argument('--state_dim', default=16, type=int, - help='State (qpos) dimension') - parser.add_argument('--action_dim', default=16, type=int, - help='Action dimension') - parser.add_argument('--num_queries', default=16, type=int, - help='Number of action queries (chunk size)') - - # DiT (Diffusion Transformer) parameters - parser.add_argument('--num_layers', default=16, type=int, - help='Number of transformer layers') - parser.add_argument('--nheads', default=32, type=int, - help='Number of attention heads') - parser.add_argument('--mlp_ratio', default=4, type=float, - help='MLP hidden dimension ratio') - parser.add_argument('--dropout', default=0.2, type=float, - help='Dropout rate') - - # Backbone parameters - parser.add_argument('--backbone', default='dino_v2', type=str, - help='Backbone architecture (dino_v2, resnet18, resnet34)') - parser.add_argument('--position_embedding', default='sine', type=str, - choices=('sine', 'learned'), - help='Type of positional encoding') - - # Camera configuration - parser.add_argument('--camera_names', default=[], nargs='+', - help='List of camera names for observations') - - # Other parameters (not directly used but kept for compatibility) - parser.add_argument('--batch_size', default=15, type=int) - parser.add_argument('--epochs', default=20000, type=int) - parser.add_argument('--masks', action='store_true', - help='Use intermediate layer features') - parser.add_argument('--dilation', action='store_false', - help='Use dilated convolution in backbone') - - return parser - - -def build_gr00t_model_and_optimizer(args_override): - """ - Build GR00T model and optimizer from config dictionary. - - This function is designed to work with config.yaml loading: - 1. Parse default arguments - 2. Override with values from args_override (typically from config['gr00t']) - 3. Build model and optimizer - - Args: - args_override: Dictionary of config values, typically from config.yaml's 'gr00t:' section - Expected keys: embed_dim, hidden_dim, state_dim, action_dim, - num_queries, nheads, mlp_ratio, dropout, num_layers, - lr, lr_backbone, camera_names, backbone, etc. - - Returns: - model: GR00T model on CUDA - optimizer: AdamW optimizer with separate learning rates for backbone and other params - """ - parser = argparse.ArgumentParser('GR00T training and evaluation script', - parents=[get_args_parser()]) - args = parser.parse_args() - - # Override with config values - for k, v in args_override.items(): - setattr(args, k, v) - - # Build model - model = build_gr00t_model(args) - model.cuda() - - # Create parameter groups with different learning rates - param_dicts = [ - { - "params": [p for n, p in model.named_parameters() - if "backbone" not in n and p.requires_grad] - }, - { - "params": [p for n, p in model.named_parameters() - if "backbone" in n and p.requires_grad], - "lr": args.lr_backbone, - }, - ] - - optimizer = torch.optim.AdamW(param_dicts, - lr=args.lr, - weight_decay=args.weight_decay) - - return model, optimizer diff --git a/roboimi/gr00t/models/__init__.py b/roboimi/gr00t/models/__init__.py deleted file mode 100644 index 327396a..0000000 --- a/roboimi/gr00t/models/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .gr00t import build_gr00t_model - -__all__ = ['build_gr00t_model'] diff --git a/roboimi/gr00t/models/backbone.py b/roboimi/gr00t/models/backbone.py deleted file mode 100644 index 759bfb5..0000000 --- a/roboimi/gr00t/models/backbone.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -""" -Backbone modules. -""" -from collections import OrderedDict - -import torch -import torch.nn.functional as F -import torchvision -from torch import nn -from torchvision.models._utils import IntermediateLayerGetter -from typing import Dict, List - -from util.misc import NestedTensor, is_main_process - -from .position_encoding import build_position_encoding - -class FrozenBatchNorm2d(torch.nn.Module): - """ - BatchNorm2d where the batch statistics and the affine parameters are fixed. - - Copy-paste from torchvision.misc.ops with added eps before rqsrt, - without which any other policy_models than torchvision.policy_models.resnet[18,34,50,101] - produce nans. - """ - - def __init__(self, n): - super(FrozenBatchNorm2d, self).__init__() - self.register_buffer("weight", torch.ones(n)) - self.register_buffer("bias", torch.zeros(n)) - self.register_buffer("running_mean", torch.zeros(n)) - self.register_buffer("running_var", torch.ones(n)) - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - num_batches_tracked_key = prefix + 'num_batches_tracked' - if num_batches_tracked_key in state_dict: - del state_dict[num_batches_tracked_key] - - super(FrozenBatchNorm2d, self)._load_from_state_dict( - state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs) - - def forward(self, x): - # move reshapes to the beginning - # to make it fuser-friendly - w = self.weight.reshape(1, -1, 1, 1) - b = self.bias.reshape(1, -1, 1, 1) - rv = self.running_var.reshape(1, -1, 1, 1) - rm = self.running_mean.reshape(1, -1, 1, 1) - eps = 1e-5 - scale = w * (rv + eps).rsqrt() - bias = b - rm * scale - return x * scale + bias - - -class BackboneBase(nn.Module): - - def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool): - super().__init__() - # for name, parameter in backbone.named_parameters(): # only train later layers # TODO do we want this? - # if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: - # parameter.requires_grad_(False) - if return_interm_layers: - return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} - else: - return_layers = {'layer4': "0"} - self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) - self.num_channels = num_channels - - def forward(self, tensor): - xs = self.body(tensor) - return xs - # out: Dict[str, NestedTensor] = {} - # for name, x in xs.items(): - # m = tensor_list.mask - # assert m is not None - # mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] - # out[name] = NestedTensor(x, mask) - # return out - - -class Backbone(BackboneBase): - """ResNet backbone with frozen BatchNorm.""" - def __init__(self, name: str, - train_backbone: bool, - return_interm_layers: bool, - dilation: bool): - backbone = getattr(torchvision.models, name)( - replace_stride_with_dilation=[False, False, dilation], - pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) # pretrained # TODO do we want frozen batch_norm?? - num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 - super().__init__(backbone, train_backbone, num_channels, return_interm_layers) - - -# class DINOv2BackBone(nn.Module): -# def __init__(self) -> None: -# super().__init__() -# self.body = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14') -# self.body.eval() -# self.num_channels = 384 - -# @torch.no_grad() -# def forward(self, tensor): -# xs = self.body.forward_features(tensor)["x_norm_patchtokens"] -# od = OrderedDict() -# od["0"] = xs.reshape(xs.shape[0], 22, 16, 384).permute(0, 3, 2, 1) -# return od - -class DINOv2BackBone(nn.Module): - def __init__(self, return_interm_layers: bool = False) -> None: - super().__init__() - self.body = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14') - self.body.eval() - self.num_channels = 384 - self.return_interm_layers = return_interm_layers - - @torch.no_grad() - def forward(self, tensor): - features = self.body.forward_features(tensor) - - if self.return_interm_layers: - - layer1 = features["x_norm_patchtokens"] - layer2 = features["x_norm_patchtokens"] - layer3 = features["x_norm_patchtokens"] - layer4 = features["x_norm_patchtokens"] - - od = OrderedDict() - od["0"] = layer1.reshape(layer1.shape[0], 22, 16, 384).permute(0, 3, 2, 1) - od["1"] = layer2.reshape(layer2.shape[0], 22, 16, 384).permute(0, 3, 2, 1) - od["2"] = layer3.reshape(layer3.shape[0], 22, 16, 384).permute(0, 3, 2, 1) - od["3"] = layer4.reshape(layer4.shape[0], 22, 16, 384).permute(0, 3, 2, 1) - return od - else: - xs = features["x_norm_patchtokens"] - od = OrderedDict() - od["0"] = xs.reshape(xs.shape[0], 22, 16, 384).permute(0, 3, 2, 1) - return od - -class Joiner(nn.Sequential): - def __init__(self, backbone, position_embedding): - super().__init__(backbone, position_embedding) - - def forward(self, tensor_list: NestedTensor): - xs = self[0](tensor_list) - out: List[NestedTensor] = [] - pos = [] - for name, x in xs.items(): - out.append(x) - # position encoding - pos.append(self[1](x).to(x.dtype)) - - return out, pos - - -def build_backbone(args): - position_embedding = build_position_encoding(args) - train_backbone = args.lr_backbone > 0 - return_interm_layers = args.masks - if args.backbone == 'dino_v2': - backbone = DINOv2BackBone() - else: - assert args.backbone in ['resnet18', 'resnet34'] - backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) - model = Joiner(backbone, position_embedding) - model.num_channels = backbone.num_channels - return model diff --git a/roboimi/gr00t/models/dit.py b/roboimi/gr00t/models/dit.py deleted file mode 100644 index ad8cede..0000000 --- a/roboimi/gr00t/models/dit.py +++ /dev/null @@ -1,142 +0,0 @@ -from typing import Optional - -from diffusers import ConfigMixin, ModelMixin -from diffusers.configuration_utils import register_to_config -from diffusers.models.embeddings import SinusoidalPositionalEmbedding, TimestepEmbedding, Timesteps -import torch -from torch import nn -import torch.nn.functional as F - -class TimestepEncoder(nn.Module): - def __init__(self, args): - super().__init__() - embedding_dim = args.embed_dim - self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1) - self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) - - def forward(self, timesteps): - dtype = next(self.parameters()).dtype - timesteps_proj = self.time_proj(timesteps).to(dtype) - timesteps_emb = self.timestep_embedder(timesteps_proj) # (N, D) - return timesteps_emb - - -class AdaLayerNorm(nn.Module): - def __init__(self, embedding_dim, norm_eps=1e-5, norm_elementwise_affine=False): - super().__init__() - - output_dim = embedding_dim * 2 - self.silu = nn.SiLU() - self.linear = nn.Linear(embedding_dim, output_dim) - self.norm = nn.LayerNorm(output_dim // 2, norm_eps, norm_elementwise_affine) - - def forward( - self, - x: torch.Tensor, - temb: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - temb = self.linear(self.silu(temb)) - scale, shift = temb.chunk(2, dim=1) - x = self.norm(x) * (1 + scale[:, None]) + shift[:, None] - return x - - -class BasicTransformerBlock(nn.Module): - def __init__(self, args, crosss_attention_dim, use_self_attn=False): - super().__init__() - dim = args.embed_dim - num_heads = args.nheads - mlp_ratio = args.mlp_ratio - dropout = args.dropout - self.norm1 = AdaLayerNorm(dim) - - if not use_self_attn: - self.attn = nn.MultiheadAttention( - embed_dim=dim, - num_heads=num_heads, - dropout=dropout, - kdim=crosss_attention_dim, - vdim=crosss_attention_dim, - batch_first=True, - ) - else: - self.attn = nn.MultiheadAttention( - embed_dim=dim, - num_heads=num_heads, - dropout=dropout, - batch_first=True, - ) - - self.norm2 = nn.LayerNorm(dim, eps=1e-5, elementwise_affine=False) - - self.mlp = nn.Sequential( - nn.Linear(dim, dim * mlp_ratio), - nn.GELU(), - nn.Dropout(dropout), - nn.Linear(dim * mlp_ratio, dim), - nn.Dropout(dropout) - ) - - def forward(self, hidden_states, temb, context=None): - norm_hidden_states = self.norm1(hidden_states, temb) - - attn_output = self.attn( - norm_hidden_states, - context if context is not None else norm_hidden_states, - context if context is not None else norm_hidden_states, - )[0] - - hidden_states = attn_output + hidden_states - - norm_hidden_states = self.norm2(hidden_states) - - ff_output = self.mlp(norm_hidden_states) - - hidden_states = ff_output + hidden_states - - return hidden_states - -class DiT(nn.Module): - def __init__(self, args, cross_attention_dim): - super().__init__() - inner_dim = args.embed_dim - num_layers = args.num_layers - output_dim = args.hidden_dim - - self.timestep_encoder = TimestepEncoder(args) - - all_blocks = [] - for idx in range(num_layers): - use_self_attn = idx % 2 == 1 - if use_self_attn: - block = BasicTransformerBlock(args, crosss_attention_dim=None, use_self_attn=True) - else: - block = BasicTransformerBlock(args, crosss_attention_dim=cross_attention_dim, use_self_attn=False) - all_blocks.append(block) - - self.transformer_blocks = nn.ModuleList(all_blocks) - - self.norm_out = nn.LayerNorm(inner_dim, eps=1e-6, elementwise_affine=False) - self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim) - self.proj_out_2 = nn.Linear(inner_dim, output_dim) - - def forward(self, hidden_states, timestep, encoder_hidden_states): - temb = self.timestep_encoder(timestep) - - hidden_states = hidden_states.contiguous() - encoder_hidden_states = encoder_hidden_states.contiguous() - - for idx, block in enumerate(self.transformer_blocks): - if idx % 2 == 1: - hidden_states = block(hidden_states, temb) - else: - hidden_states = block(hidden_states, temb, context=encoder_hidden_states) - - conditioning = temb - shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1) - hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None] - return self.proj_out_2(hidden_states) - - -def build_dit(args, cross_attention_dim): - return DiT(args, cross_attention_dim) \ No newline at end of file diff --git a/roboimi/gr00t/models/gr00t.py b/roboimi/gr00t/models/gr00t.py deleted file mode 100644 index 7ed9cb4..0000000 --- a/roboimi/gr00t/models/gr00t.py +++ /dev/null @@ -1,124 +0,0 @@ - -from .modules import ( - build_action_decoder, - build_action_encoder, - build_state_encoder, - build_time_sampler, - build_noise_scheduler, -) -from .backbone import build_backbone -from .dit import build_dit -import torch -import torch.nn as nn -import torch.nn.functional as F - -class gr00t(nn.Module): - def __init__( - self, - backbones, - dit, - state_encoder, - action_encoder, - action_decoder, - time_sampler, - noise_scheduler, - num_queries, - camera_names, - ): - super().__init__() - self.num_queries = num_queries - self.camera_names = camera_names - self.dit = dit - self.state_encoder = state_encoder - self.action_encoder = action_encoder - self.action_decoder = action_decoder - self.time_sampler = time_sampler - self.noise_scheduler = noise_scheduler - - if backbones is not None: - self.backbones = nn.ModuleList(backbones) - else: - raise NotImplementedError - - def forward(self, qpos, image, actions=None, is_pad=None): - is_training = actions is not None # train or val - bs, _ = qpos.shape - - all_cam_features = [] - for cam_id, cam_name in enumerate(self.camera_names): - # features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED - features, pos = self.backbones[cam_id](image[:, cam_id]) - features = features[0] # take the last layer feature - B, C, H, W = features.shape - features_seq = features.permute(0, 2, 3, 1).reshape(B, H * W, C) - all_cam_features.append(features_seq) - encoder_hidden_states = torch.cat(all_cam_features, dim=1) - - state_features = self.state_encoder(qpos) # [B, 1, emb_dim] - - if is_training: - # training logic - - timesteps = self.time_sampler(bs, actions.device, actions.dtype) - noisy_actions, target_velocity = self.noise_scheduler.add_noise( - actions, timesteps - ) - t_discretized = (timesteps[:, 0, 0] * 1000).long() - action_features = self.action_encoder(noisy_actions, t_discretized) - sa_embs = torch.cat((state_features, action_features), dim=1) - model_output = self.dit(sa_embs, t_discretized, encoder_hidden_states) - pred = self.action_decoder(model_output) - pred_actions = pred[:, -actions.shape[1] :] - action_loss = F.mse_loss(pred_actions, target_velocity, reduction='none') - return pred_actions, action_loss - else: - actions = torch.randn(bs, self.num_queries, qpos.shape[-1], device=qpos.device, dtype=qpos.dtype) - k = 5 - dt = 1.0 / k - for t in range(k): - t_cont = t / float(k) - t_discretized = int(t_cont * 1000) - timesteps = torch.full((bs,), t_discretized, device=qpos.device, dtype=qpos.dtype) - action_features = self.action_encoder(actions, timesteps) - sa_embs = torch.cat((state_features, action_features), dim=1) - # Create tensor of shape [B] for DiT (consistent with training path) - model_output = self.dit(sa_embs, timesteps, encoder_hidden_states) - pred = self.action_decoder(model_output) - pred_velocity = pred[:, -self.num_queries :] - actions = actions + pred_velocity * dt - return actions, _ -def build_gr00t_model(args): - state_dim = args.state_dim - action_dim = args.action_dim - - backbones = [] - for _ in args.camera_names: - backbone = build_backbone(args) - backbones.append(backbone) - - cross_attention_dim = backbones[0].num_channels - - dit = build_dit(args, cross_attention_dim) - - state_encoder = build_state_encoder(args) - action_encoder = build_action_encoder(args) - action_decoder = build_action_decoder(args) - time_sampler = build_time_sampler(args) - noise_scheduler = build_noise_scheduler(args) - model = gr00t( - backbones, - dit, - state_encoder, - action_encoder, - action_decoder, - time_sampler, - noise_scheduler, - args.num_queries, - args.camera_names, - ) - - n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) - print("number of parameters: %.2fM" % (n_parameters/1e6,)) - return model - - diff --git a/roboimi/gr00t/models/modules.py b/roboimi/gr00t/models/modules.py deleted file mode 100644 index 727cee3..0000000 --- a/roboimi/gr00t/models/modules.py +++ /dev/null @@ -1,179 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -# ActionEncoder -class SinusoidalPositionalEncoding(nn.Module): - def __init__(self, args): - super().__init__() - self.embed_dim = args.embed_dim - - def forward(self, timesteps): - timesteps = timesteps.float() - B, T = timesteps.shape - device = timesteps.device - - half_dim = self.embed_dim // 2 - - exponent = -torch.arange(half_dim, dtype=torch.float, device=device) * ( - torch.log(torch.tensor(10000.0)) / half_dim - ) - - freqs = timesteps.unsqueeze(-1) * exponent.exp() - - sin = torch.sin(freqs) - cos = torch.cos(freqs) - enc = torch.cat([sin, cos], dim=-1) # (B, T, w) - - return enc - - -class ActionEncoder(nn.Module): - def __init__(self, args): - super().__init__() - action_dim = args.action_dim - embed_dim = args.embed_dim - - self.W1 = nn.Linear(action_dim, embed_dim) - self.W2 = nn.Linear(2 * embed_dim, embed_dim) - self.W3 = nn.Linear(embed_dim, embed_dim) - - self.pos_encoder = SinusoidalPositionalEncoding(args) - - def forward(self, actions, timesteps): - B, T, _ = actions.shape - - # 1) Expand each batch's single scalar time 'tau' across all T steps - # so that shape => (B, T) - # Handle different input shapes: (B,), (B, 1), (B, 1, 1) - # Reshape to (B,) then expand to (B, T) - # if timesteps.dim() == 3: - # # Shape (B, 1, 1) or (B, T, 1) -> (B,) - # timesteps = timesteps[:, 0, 0] - # elif timesteps.dim() == 2: - # # Shape (B, 1) or (B, T) -> take first element if needed - # if timesteps.shape[1] == 1: - # timesteps = timesteps[:, 0] - # # else: already (B, T), use as is - # elif timesteps.dim() != 1: - # raise ValueError( - # f"Expected `timesteps` to have shape (B,), (B, 1), or (B, 1, 1), got {timesteps.shape}" - # ) - - # Now timesteps should be (B,), expand to (B, T) - if timesteps.dim() == 1 and timesteps.shape[0] == B: - timesteps = timesteps.unsqueeze(1).expand(-1, T) - else: - raise ValueError( - "Expected `timesteps` to have shape (B,) so we can replicate across T." - ) - - # 2) Standard action MLP step for shape => (B, T, w) - a_emb = self.W1(actions) - - # 3) Get the sinusoidal encoding (B, T, w) - tau_emb = self.pos_encoder(timesteps).to(dtype=a_emb.dtype) - - # 4) Concat along last dim => (B, T, 2w), then W2 => (B, T, w), swish - x = torch.cat([a_emb, tau_emb], dim=-1) - x = F.silu(self.W2(x)) - - # 5) Finally W3 => (B, T, w) - x = self.W3(x) - - return x - - -def build_action_encoder(args): - return ActionEncoder(args) - - -# StateEncoder -class StateEncoder(nn.Module): - def __init__(self, args): - super().__init__() - input_dim = args.state_dim - hidden_dim = args.hidden_dim - output_dim = args.embed_dim - - self.mlp = nn.Sequential( - nn.Linear(input_dim, hidden_dim), - nn.ReLU(), - nn.Linear(hidden_dim, output_dim), - ) - - def forward(self, states): - state_emb = self.mlp(states) # [B, emb_dim] - state_emb = state_emb.unsqueeze(1) - return state_emb # [B, 1, emb_dim] - - -def build_state_encoder(args): - return StateEncoder(args) - - -# ActionDecoder -class ActionDecoder(nn.Module): - def __init__(self,args): - super().__init__() - input_dim = args.hidden_dim - hidden_dim = args.hidden_dim - output_dim = args.action_dim - - self.num_queries = args.num_queries - - self.mlp = nn.Sequential( - nn.Linear(input_dim, hidden_dim), - nn.ReLU(), - nn.Linear(hidden_dim, output_dim), - ) - - def forward(self, model_output): - pred_actions = self.mlp(model_output) - return pred_actions[:, -self.num_queries:] - - -def build_action_decoder(args): - return ActionDecoder(args) - - -# TimeSampler -class TimeSampler(nn.Module): - def __init__(self, noise_s = 0.999, noise_beta_alpha=1.5, noise_beta_beta=1.0): - super().__init__() - self.noise_s = noise_s - self.beta_dist = torch.distributions.Beta(noise_beta_alpha, noise_beta_beta) - - def forward(self, batch_size, device, dtype): - sample = self.beta_dist.sample([batch_size]).to(device, dtype=dtype) - sample = (1 - sample) * self.noise_s - return sample[:, None, None] - - -def build_time_sampler(args): - return TimeSampler() - - -# NoiseScheduler -import torch -import torch.nn as nn - -class FlowMatchingScheduler(nn.Module): - def __init__(self): - super().__init__() - - # --- 训练逻辑:加噪并计算目标 --- - def add_noise(self, actions, timesteps): - noise = torch.randn_like(actions) - noisy_samples = actions * timesteps + noise * (1 - timesteps) - target_velocity = actions - noise - - return noisy_samples, target_velocity - - # --- 推理逻辑:欧拉步 (Euler Step) --- - def step(self, model_output, sample, dt): - prev_sample = sample + model_output * dt - return prev_sample - -def build_noise_scheduler(args): - return FlowMatchingScheduler() diff --git a/roboimi/gr00t/models/position_encoding.py b/roboimi/gr00t/models/position_encoding.py deleted file mode 100644 index c75733e..0000000 --- a/roboimi/gr00t/models/position_encoding.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -""" -Various positional encodings for the transformer. -""" -import math -import torch -from torch import nn - -from util.misc import NestedTensor - - -class PositionEmbeddingSine(nn.Module): - """ - This is a more standard version of the position embedding, very similar to the one - used by the Attention is all you need paper, generalized to work on images. - """ - def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): - super().__init__() - self.num_pos_feats = num_pos_feats - self.temperature = temperature - self.normalize = normalize - if scale is not None and normalize is False: - raise ValueError("normalize should be True if scale is passed") - if scale is None: - scale = 2 * math.pi - self.scale = scale - - def forward(self, tensor): - x = tensor - # mask = tensor_list.mask - # assert mask is not None - # not_mask = ~mask - - not_mask = torch.ones_like(x[0, [0]]) - y_embed = not_mask.cumsum(1, dtype=torch.float32) - x_embed = not_mask.cumsum(2, dtype=torch.float32) - if self.normalize: - eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale - - dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) - dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) - - pos_x = x_embed[:, :, :, None] / dim_t - pos_y = y_embed[:, :, :, None] / dim_t - pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) - return pos - - -class PositionEmbeddingLearned(nn.Module): - """ - Absolute pos embedding, learned. - """ - def __init__(self, num_pos_feats=256): - super().__init__() - self.row_embed = nn.Embedding(50, num_pos_feats) - self.col_embed = nn.Embedding(50, num_pos_feats) - self.reset_parameters() - - def reset_parameters(self): - nn.init.uniform_(self.row_embed.weight) - nn.init.uniform_(self.col_embed.weight) - - def forward(self, tensor_list: NestedTensor): - x = tensor_list.tensors - h, w = x.shape[-2:] - i = torch.arange(w, device=x.device) - j = torch.arange(h, device=x.device) - x_emb = self.col_embed(i) - y_emb = self.row_embed(j) - pos = torch.cat([ - x_emb.unsqueeze(0).repeat(h, 1, 1), - y_emb.unsqueeze(1).repeat(1, w, 1), - ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) - return pos - - -def build_position_encoding(args): - N_steps = args.hidden_dim // 2 - if args.position_embedding in ('v2', 'sine'): - # TODO find a better way of exposing other arguments - position_embedding = PositionEmbeddingSine(N_steps, normalize=True) - elif args.position_embedding in ('v3', 'learned'): - position_embedding = PositionEmbeddingLearned(N_steps) - else: - raise ValueError(f"not supported {args.position_embedding}") - - return position_embedding diff --git a/roboimi/gr00t/policy.py b/roboimi/gr00t/policy.py deleted file mode 100644 index 83416d4..0000000 --- a/roboimi/gr00t/policy.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -GR00T Policy wrapper for imitation learning. - -This module provides the gr00tPolicy class that wraps the GR00T model -for training and evaluation in the imitation learning framework. -""" -import torch.nn as nn -from torch.nn import functional as F -from torchvision.transforms import v2 -import torch -from roboimi.gr00t.main import build_gr00t_model_and_optimizer - - -class gr00tPolicy(nn.Module): - """ - GR00T Policy for action prediction using diffusion-based DiT architecture. - - This policy wraps the GR00T model and handles: - - Image resizing to match DINOv2 patch size requirements - - Image normalization (ImageNet stats) - - Training with action chunks and loss computation - - Inference with diffusion sampling - """ - def __init__(self, args_override): - super().__init__() - model, optimizer = build_gr00t_model_and_optimizer(args_override) - self.model = model - self.optimizer = optimizer - - # DINOv2 requires image dimensions to be multiples of patch size (14) - # Common sizes: 224x224, 336x336, etc. (14*16=224, 14*24=336) - self.patch_h = 16 # Number of patches vertically - self.patch_w = 22 # Number of patches horizontally - target_size = (self.patch_h * 14, self.patch_w * 14) # (224, 308) - - # Training transform with data augmentation - self.train_transform = v2.Compose([ - v2.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5), - v2.RandomPerspective(distortion_scale=0.5), - v2.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.9, 1.1)), - v2.GaussianBlur(kernel_size=(9, 9), sigma=(0.1, 2.0)), - v2.Resize(target_size), - v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), - ]) - - # Inference transform (no augmentation) - self.inference_transform = v2.Compose([ - v2.Resize(target_size), - v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), - ]) - - def __call__(self, qpos, image, actions=None, is_pad=None): - """ - Forward pass for training or inference. - - Args: - qpos: Joint positions [B, state_dim] - image: Camera images [B, num_cameras, C, H, W] - actions: Ground truth actions [B, chunk_size, action_dim] (training only) - is_pad: Padding mask [B, chunk_size] (training only) - - Returns: - Training: dict with 'mse' loss - Inference: predicted actions [B, num_queries, action_dim] - """ - # Apply transforms (resize + normalization) - if actions is not None: # training time - image = self.train_transform(image) - else: # inference time - image = self.inference_transform(image) - - if actions is not None: # training time - actions = actions[:, :self.model.num_queries] - is_pad = is_pad[:, :self.model.num_queries] - _, action_loss = self.model(qpos, image, actions, is_pad) - - # Mask out padded positions - mse_loss = (action_loss * ~is_pad.unsqueeze(-1)).mean() - - loss_dict = { - 'loss': mse_loss - } - return loss_dict - else: # inference time - a_hat, _ = self.model(qpos, image) - return a_hat - - def configure_optimizers(self): - """Return the optimizer for training.""" - return self.optimizer