8 changes: 7 additions & 1 deletion pyproject.toml
@@ -117,7 +117,13 @@ phone = ["hebi-py>=2.8.0,<2.12.0", "teleop>=0.1.0,<0.2.0", "fastapi<1.0"]

# Policies
pi = ["transformers @ git+https://github.com/huggingface/transformers.git@fix/lerobot_openpi"]
smolvla = ["lerobot[transformers-dep]", "num2words>=0.5.14,<0.6.0", "accelerate>=1.7.0,<2.0.0", "safetensors>=0.4.3,<1.0.0"]
smolvla = [
    "lerobot[transformers-dep]",
    "num2words>=0.5.14,<0.6.0",
    "accelerate>=1.7.0,<2.0.0",
    "safetensors>=0.4.3,<1.0.0",
    "peft>=0.13.0,<1.0.0"
]
groot = [
    "lerobot[transformers-dep]",
    "peft>=0.13.0,<1.0.0",
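The smolvla extra now pins peft alongside its existing dependencies, matching the pin the groot extra already carries. A minimal sanity check, assuming the package was installed with this extra and that packaging is available in the environment (neither is part of this diff):

# Sanity check that the peft pin from the smolvla extra resolved (sketch only).
from importlib.metadata import version

from packaging.version import Version  # packaging is assumed to be installed

peft_version = Version(version("peft"))
assert Version("0.13.0") <= peft_version < Version("1.0.0"), peft_version
print(f"peft {peft_version} satisfies the >=0.13.0,<1.0.0 pin")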
8 changes: 8 additions & 0 deletions src/lerobot/policies/smolvla/configuration_smolvla.py
@@ -105,6 +105,14 @@ class SmolVLAConfig(PreTrainedConfig):

    # Real-Time Chunking (RTC) configuration
    rtc_config: RTCConfig | None = None

    # LoRA
    lora_r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_target_modules: tuple[str, ...] = ("q_proj", "k_proj", "v_proj", "o_proj")
    lora_on_vlm: bool = False
    lora_on_expert: bool = False

    def __post_init__(self):
        super().__post_init__()
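A usage sketch for the new fields follows. The import path and the non-LoRA field names are assumptions (only the LoRA fields are defined in this diff), and per the checks added in smolvlm_with_expert.py further down, lora_on_vlm=True is rejected unless train_expert_only and freeze_vision_encoder are both False:

# Sketch only: enable LoRA on the VLM backbone via the new config fields.
from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig

config = SmolVLAConfig(
    freeze_vision_encoder=False,  # required when lora_on_vlm=True (see ValueError checks below)
    train_expert_only=False,      # likewise incompatible with lora_on_vlm=True
    lora_r=16,                    # rank of the low-rank update matrices
    lora_alpha=32,                # scaling applied to the LoRA update
    lora_dropout=0.05,
    lora_target_modules=("q_proj", "k_proj", "v_proj", "o_proj"),
    lora_on_vlm=True,             # wrap the VLM with PEFT adapters
    lora_on_expert=False,         # keep the action expert fully trainable
)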
6 changes: 6 additions & 0 deletions src/lerobot/policies/smolvla/modeling_smolvla.py
@@ -527,6 +527,12 @@ def __init__(self, config: SmolVLAConfig, rtc_processor: RTCProcessor | None = N
            num_vlm_layers=self.config.num_vlm_layers,
            self_attn_every_n_layers=self.config.self_attn_every_n_layers,
            expert_width_multiplier=self.config.expert_width_multiplier,
            lora_r=self.config.lora_r,
            lora_alpha=self.config.lora_alpha,
            lora_dropout=self.config.lora_dropout,
            lora_target_modules=self.config.lora_target_modules,
            lora_on_vlm=self.config.lora_on_vlm,
            lora_on_expert=self.config.lora_on_expert,
        )
        self.state_proj = nn.Linear(
            self.config.max_state_dim, self.vlm_with_expert.config.text_config.hidden_size
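These keyword arguments only forward the new config fields into the SmolVLMWithExpertModel wrapper. A hedged end-to-end check, assuming SmolVLAPolicy can be built directly from the config in the earlier sketch (constructor details beyond the config are not shown in this diff):

# Hedged sketch: build the policy and confirm that, with lora_on_vlm=True, the
# trainable tensors are dominated by LoRA adapters.
from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy

policy = SmolVLAPolicy(config)  # `config` from the previous sketch

lora_params = [n for n, p in policy.named_parameters() if p.requires_grad and "lora_" in n]
other_params = [n for n, p in policy.named_parameters() if p.requires_grad and "lora_" not in n]
print(f"{len(lora_params)} trainable LoRA tensors, {len(other_params)} other trainable tensors")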
70 changes: 65 additions & 5 deletions src/lerobot/policies/smolvla/smolvlm_with_expert.py
@@ -23,6 +23,7 @@
    AutoProcessor,
    SmolVLMForConditionalGeneration,
)
from peft import LoraConfig, get_peft_model, TaskType


def apply_rope(x, positions, max_wavelength=10_000):
@@ -71,8 +72,22 @@ def __init__(
        self_attn_every_n_layers: int = -1,
        expert_width_multiplier: float = 0.5,
        device: str = "auto",
        lora_r: int = 8,
        lora_alpha: int = 16,
        lora_dropout: float = 0.05,
        lora_target_modules: tuple[str, ...] = ("q_proj", "k_proj", "v_proj", "o_proj"),
        lora_on_vlm: bool = True,
        lora_on_expert: bool = False,
    ):
        super().__init__()

        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        self.lora_target_modules = lora_target_modules
        self.lora_on_vlm = lora_on_vlm
        self.lora_on_expert = lora_on_expert

        if load_vlm_weights:
            print(f"Loading {model_id} weights ...")
            self.vlm = AutoModelForImageTextToText.from_pretrained(
@@ -131,10 +146,44 @@ def __init__(
        self.train_expert_only = train_expert_only
        self.attention_mode = attention_mode
        self.expert_hidden_size = lm_expert_config.hidden_size

        if self.train_expert_only and self.lora_on_vlm:
            raise ValueError("Cannot have both train_expert_only and lora_on_vlm set to True.")

        if self.freeze_vision_encoder and self.lora_on_vlm:
            raise ValueError("Cannot have both freeze_vision_encoder and lora_on_vlm set to True.")

        if self.lora_on_vlm:
            vlm_lora_config = LoraConfig(
                r=self.lora_r,
                lora_alpha=self.lora_alpha,
                target_modules=list(self.lora_target_modules),
                lora_dropout=self.lora_dropout,
                bias="none",
                task_type=TaskType.FEATURE_EXTRACTION,
            )
            self.vlm = get_peft_model(self.vlm, vlm_lora_config)

        if self.lora_on_expert:
            expert_lora_config = LoraConfig(
                r=self.lora_r,
                lora_alpha=self.lora_alpha,
                target_modules=list(self.lora_target_modules),
                lora_dropout=self.lora_dropout,
                bias="none",
                task_type=TaskType.FEATURE_EXTRACTION,
            )
            self.lm_expert = get_peft_model(self.lm_expert, expert_lora_config)
        self.set_requires_grad()

    def get_vlm_model(self):
        return self.vlm.model
        vlm = self.vlm

        if self.lora_on_vlm:
            # PeftModelForFeatureExtraction -> LoraModel -> SmolVLMForConditionalGeneration
            return vlm.base_model.model.model
        else:
            return vlm.model

    def set_requires_grad(self):
        if self.freeze_vision_encoder:
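The attribute chain in get_vlm_model() mirrors how get_peft_model nests the original module. A standalone toy illustration of that nesting, using an arbitrary PyTorch block rather than SmolVLM (the module and target names below are illustrative only):

# Toy illustration of the PeftModel -> LoraModel -> base module nesting that
# get_vlm_model() unwraps when lora_on_vlm is set.
import torch.nn as nn
from peft import LoraConfig, get_peft_model

base = nn.TransformerEncoderLayer(d_model=32, nhead=4)  # stand-in for the wrapped backbone
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["linear1", "linear2"],  # Linear submodules of the toy block
    lora_dropout=0.05,
    bias="none",
)
wrapped = get_peft_model(base, lora_cfg)

# wrapped.base_model is the LoraModel; wrapped.base_model.model is the original
# module with its target Linears swapped for LoRA-augmented ones, which is why
# get_vlm_model() reaches through vlm.base_model.model.model above.
print(type(wrapped).__name__, type(wrapped.base_model).__name__)
wrapped.print_trainable_parameters()  # only lora_* tensors remain trainable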
@@ -161,13 +210,24 @@ def set_requires_grad(self):
                frozen_layers.append(f"text_model.model.layers.{layer}.")

            for name, params in self.vlm.named_parameters():
                if "lora_" in name:
                    continue
                if any(k in name for k in frozen_layers):
                    params.requires_grad = False
        # To avoid unused params issue with distributed training
        for name, params in self.lm_expert.named_parameters():
            if "lm_head" in name:
                params.requires_grad = False

        if self.lora_on_expert:
            self.lm_expert.eval()
            for name, p in self.lm_expert.named_parameters():
                if "lora_" in name:
                    p.requires_grad = True
                else:
                    p.requires_grad = False
        else:
            # To avoid unused params issue with distributed training
            for name, params in self.lm_expert.named_parameters():
                if "lm_head" in name:
                    params.requires_grad = False

    def train(self, mode: bool = True):
        super().train(mode)

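A small helper, not part of this PR, for sanity-checking the freezing rules above: with LoRA enabled, only lora_ tensors (plus whatever the pre-existing rules leave unfrozen) should still require gradients.

# Hedged check of the requires_grad policy implemented in set_requires_grad().
def summarize_trainable(module, tag):
    """Count trainable LoRA vs. non-LoRA parameters in a module."""
    lora = sum(p.numel() for n, p in module.named_parameters() if p.requires_grad and "lora_" in n)
    other = sum(p.numel() for n, p in module.named_parameters() if p.requires_grad and "lora_" not in n)
    print(f"{tag}: {lora:,} trainable LoRA parameters, {other:,} other trainable parameters")

# Usage (assuming `vlm_with_expert` is an instantiated SmolVLMWithExpertModel):
# summarize_trainable(vlm_with_expert.vlm, "vlm")
# summarize_trainable(vlm_with_expert.lm_expert, "lm_expert")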