15 changes: 8 additions & 7 deletions README.md
@@ -58,9 +58,9 @@ curl http://localhost:18888/v1/chat/completions \

</details>

| Model | Size | Context | Weights | Serving |
|--------------|------|---------|--------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| OpenChat 3.2 SUPER | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2_super) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.2 --model openchat/openchat_v3.2_super --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` |
| Model | Size | Context | Weights | Serving |
|--------------------|------|---------|--------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------|
| OpenChat 3.2 SUPER | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2_super) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.2_super --engine-use-ray --worker-use-ray` |
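
For a quick smoke test from Python, a request against the running server can look like the sketch below. This is a minimal, illustrative example: it assumes the default `localhost:18888` address from the curl example above and a standard OpenAI-style chat payload; the `model` value shown (the model type name) is an assumption and may need to match whatever identifier the server expects.

```python
import requests

# Minimal sketch: query the OpenAI-compatible endpoint started by the serving
# command above. Address and payload shape are assumptions based on the curl
# example earlier in this README.
resp = requests.post(
    "http://localhost:18888/v1/chat/completions",
    json={
        "model": "openchat_v3.2",  # assumed model identifier
        "messages": [{"role": "user", "content": "Hello!"}],
    },
    timeout=60,
)
print(resp.json())
```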

For inference with Huggingface Transformers (slow and not recommended), follow the conversation template provided below:

@@ -276,10 +276,11 @@ To run the models on multiple GPUs with smaller VRAM, you can enable tensor para
<details>
<summary>OpenChat V3 (click to expand)</summary>

| Model | Size | Context | Weights | Serving |
|--------------|------|---------|--------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| OpenChat 3.2 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.2 --model openchat/openchat_v3.2 --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` |
| OpenChat 3.1 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.1) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.1_llama2 --model openchat/openchat_v3.1 --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` |
| Model | Size | Context | Weights | Serving |
|--------------|------|---------|--------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------|
| OpenChat 3.2 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.2 --engine-use-ray --worker-use-ray` |
| OpenChat 3.1 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.1) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.1 --engine-use-ray --worker-use-ray` |

</details>

## Acknowledgements
10 changes: 5 additions & 5 deletions ochat/config/model_config.py
@@ -131,7 +131,7 @@ def _v3_condition(props):
MODEL_CONFIG_MAP = {
# OpenChat V3.2
"openchat_v3.2": ModelConfig(
name="OpenChat V3.2",
name="OpenChat V3.2 Llama 2",

# Prompt
role_prefix=_v3_2_conditional_prefix,
@@ -174,8 +174,8 @@ def _v3_condition(props):
),

# OpenChat V2
"openchat_v2": ModelConfig(
name="OpenChat_v2",
"openchat_v2_llama2": ModelConfig(
name="OpenChat V2 Llama 2",

# Prompt
role_prefix=_v2_conditional_prefix,
@@ -184,7 +184,7 @@ def _v3_condition(props):
bos_token="<s>",

# Tokenize
model_max_context=2048,
model_max_context=4096,
model_create=partial(ochat.models.LlamaForCausalLM.from_pretrained,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16),
@@ -195,7 +195,7 @@ def _v3_condition(props):

# OpenChat
"openchat_llama2": ModelConfig(
name="OpenChat Llama 2",
name="OpenChat V1 Llama 2",

# Prompt
role_prefix={
10 changes: 9 additions & 1 deletion ochat/data/filter_sharegpt.py
@@ -10,6 +10,14 @@
import numpy as np


def subsample_mask(seed: int, n: int, p: float):
mask = np.zeros((n, ), np.bool_)
perm = np.random.default_rng(seed=seed).permutation(n)

mask[perm[:round(n * p)]] = True
return mask
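
# Note: unlike the previous approach of keeping each sample independently with
# probability p (a per-sample Bernoulli draw), this helper keeps exactly
# round(n * p) samples via a seeded permutation, so the subset size is
# deterministic for a given seed. For example, subsample_mask(seed=0, n=10, p=0.3)
# marks exactly 3 entries as True.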


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--in-file", type=str, required=True)
@@ -37,7 +45,7 @@

# Subsampling
if args.subsample < 1.0:
keep = np.random.default_rng(seed=args.subsample_seed).random(len(filtered_samples)) < args.subsample
keep = subsample_mask(args.subsample_seed, len(filtered_samples), args.subsample)
filtered_samples = [s for s, k in zip(filtered_samples, keep) if k]

# Print
8 changes: 4 additions & 4 deletions ochat/models/unpadded_llama.py
@@ -166,7 +166,7 @@ def forward(
nz_hidden_states: torch.Tensor,
nz_position_ids: torch.LongTensor,
cu_seqlens: torch.Tensor,
max_seqlen: torch.Tensor
max_seqlen: int
) -> torch.Tensor:
# nz_hidden_states: [nnz, num_heads, head_dim]
# nz_position_ids: [nnz]
@@ -213,7 +213,7 @@ def forward(
nz_hidden_states: torch.Tensor,
nz_position_ids: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: torch.Tensor
max_seqlen: int
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
# Self Attention
residual = nz_hidden_states
@@ -298,7 +298,7 @@ def forward(
nz_input_ids: torch.Tensor,
nz_position_ids: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: torch.Tensor,
max_seqlen: int,
) -> torch.Tensor:
nz_hidden_states = self.embed_tokens(nz_input_ids)
cos_sin = self.rotary_emb()
@@ -375,7 +375,7 @@ def forward(
nz_input_ids: torch.Tensor,
nz_position_ids: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: torch.Tensor,
max_seqlen: int,
# Unpadded labels
nz_shifted_label_ids: Optional[torch.Tensor] = None,
nz_shifted_loss_weights: Optional[torch.Tensor] = None
29 changes: 17 additions & 12 deletions ochat/serving/openai_api_server.py
@@ -29,6 +29,8 @@
from ochat.config.model_config import MODEL_CONFIG_MAP
from ochat.serving import openai_api_protocol, async_tokenizer

from transformers.utils.hub import cached_file


TIMEOUT_KEEP_ALIVE = 5 # seconds

@@ -37,7 +39,6 @@
class ModelConfig:
name: str = None

eot_token: str = None
max_length: int = None
stream_period: int = None

@@ -65,7 +66,7 @@ async def validation_exception_handler(request, exc): # pylint: disable=unused-

async def check_api_key(
auth: Optional[HTTPAuthorizationCredentials] = fastapi.Depends(HTTPBearer(auto_error=False)),
) -> str:
):
if not model.api_keys:
return

Expand Down Expand Up @@ -287,8 +288,6 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
parser = argparse.ArgumentParser(description="OpenChat OpenAI-Compatible RESTful API server.")

# Model
parser.add_argument("--model-type", type=str, required=True, help="Type of model")

parser.add_argument("--stream-period", type=int, default=6, help="Number of tokens per stream event")
parser.add_argument("--api-keys", type=str, nargs="*", default=[], help="Allowed API Keys. Leave blank to not verify")

@@ -327,22 +326,28 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
backupCount=args.log_max_count)
)

# Load model
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(engine_args)
engine_model_config = asyncio.run(engine.get_model_config())
# Load model type
with open(cached_file(path_or_repo_id=args.model, filename="openchat.json"), "r") as f:
model_type = json.load(f)["model_type"]
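# Illustrative note: "openchat.json" is the metadata file written next to the
# checkpoint by ochat/training_deepspeed/train.py (save_openchat_metadata). It
# serializes the training arguments plus the epoch, e.g. roughly
# {"model_type": "openchat_v3.2", "model_path": "...", "epoch": ...} -- a sketch,
# not an exact schema. Only the "model_type" key is consumed here.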

# Load tokenizer
tokenizer = async_tokenizer.AsyncTokenizer.remote(args.model_type, args.model)
tokenizer = async_tokenizer.AsyncTokenizer.remote(model_type, args.model)

# Model config
model.name = args.model_type
model.eot_token = MODEL_CONFIG_MAP[args.model_type].eot_token
model.max_length = MODEL_CONFIG_MAP[args.model_type].model_max_context
model.name = model_type
model.max_length = MODEL_CONFIG_MAP[model_type].model_max_context

model.stream_period = args.stream_period
model.api_keys = args.api_keys

# Set max num batched tokens
args.max_num_batched_tokens = max(args.max_num_batched_tokens, model.max_length)
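# Note: since the batched-token limit is now derived from the model's max
# context, the serving commands in the README no longer need to pass
# --max-num-batched-tokens explicitly.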

# Load model engine
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(engine_args)
engine_model_config = asyncio.run(engine.get_model_config())

# Run
uvicorn.run(app,
host=args.host,
45 changes: 29 additions & 16 deletions ochat/training_deepspeed/train.py
@@ -1,6 +1,7 @@
import argparse
import os
import math
import json
from functools import partial

import torch
@@ -41,7 +42,6 @@ def parse_args():
parser.add_argument("--local_rank", type=int, required=True)

# Model type and data
parser.add_argument("--model_type", type=str, required=True)
parser.add_argument("--model_path", type=str, required=True)
parser.add_argument("--data_path", type=str, required=True)
parser.add_argument("--save_path", type=str, required=True)
@@ -77,6 +77,7 @@ def create_dataset(args, split_name):
_rank0_print(f"Skipping loading {split_name}")
return None

_rank0_print(f"Loading {split_name} data from {filename}...")
return ParquetDataset(filename)


@@ -111,20 +112,18 @@ def batch_to_tensor(batch, int_dtype=torch.long, loss_dtype=torch.bfloat16):
batch_tensor[k] = torch.from_numpy(np.concatenate(batch.column(k).to_numpy())).to(dtype)

# cu seqlens
batch_tensor["max_seqlen"] = torch.max(batch_tensor["seqlens"])
batch_tensor["cu_seqlens"] = torch.nn.functional.pad(batch_tensor["seqlens"].cumsum(-1, dtype=torch.int32), (1, 0))

del batch_tensor["seqlens"]
# batch info
batch_info = {"max_seqlen": torch.max(batch_tensor["seqlens"]).item()}

# inputs
return batch_tensor
del batch_tensor["seqlens"]

return batch_tensor, batch_info
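
# Note: max_seqlen is returned separately as a plain Python int (via .item())
# because the unpadded model's forward signature now takes `max_seqlen: int`,
# and, unlike the entries of batch_tensor, it is not moved to the GPU in the
# training loop below (model_engine(**batch_tensor, **batch_info)).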

def create_distributed_dataloader(args, data):
# Check data
assert data.metadata["model_type"] == args.model_type, \
f"The dataset is for {data.metadata['model_type']}, but you specified {args.model_type} for training."

def create_distributed_dataloader(args, data):
# Multipack dataloader
args.batch_max_len = args.batch_size_per_gpu * MODEL_CONFIG_MAP[args.model_type].model_max_context

@@ -143,6 +142,8 @@ def create_model(args):
def create_model(args):
global LOCAL_RANK

_rank0_print(f"Loading model {args.model_type} from {args.model_path}...")

# Create model + optimizer + lr scheduler
model = MODEL_CONFIG_MAP[args.model_type].model_create(args.model_path)
# Model to assigned cuda device
@@ -198,6 +199,14 @@ def save_tokenizer(args, save_path):
tokenizer.save_pretrained(save_path)


def save_openchat_metadata(args, epoch, save_path):
metadata = vars(args)
metadata["epoch"] = epoch

with open(os.path.join(save_path, "openchat.json"), "w") as f:
json.dump(metadata, f, default=lambda o: "<non-serializable>")


def calculate_auto_lr(lr, batch_max_len, train_dataset):
if lr is not None:
return lr
@@ -227,10 +236,12 @@ def train():
LOCAL_RANK = args.local_rank

# Dataset
_rank0_print("Loading data...")
train_dataset = create_dataset(args, "train")
eval_dataset = create_dataset(args, "eval")

# Load model type
args.model_type = train_dataset.metadata["model_type"]

# Data Loader
train_loader = create_distributed_dataloader(args, train_dataset)
train_total_steps = args.epochs * train_loader.num_batches()
@@ -243,7 +254,6 @@
args.lr = calculate_auto_lr(args.lr, args.batch_max_len, train_dataset)

# Model
_rank0_print("Loading model...")
model_engine, optimizer = create_model(args)

# LR Scheduler
@@ -265,16 +275,16 @@
model_engine.train()

train_loader.set_epoch(epoch)
for batch, all_numseq, cur_numseq in train_loader:
for (batch_tensor, batch_info), all_numseq, cur_numseq in train_loader:
step += 1
if step > train_total_steps: # At most train_total_steps
break

# To device
batch = {k: (v.to(args.device) if v is not None else None) for k, v in batch.items()}
batch_tensor = {k: (v.to(args.device) if v is not None else None) for k, v in batch_tensor.items()}

# Update
loss = (1 / all_numseq) * model_engine(**batch).loss
loss = (1 / all_numseq) * model_engine(**batch_tensor, **batch_info).loss

model_engine.backward(loss)

@@ -304,12 +314,12 @@ def train():

eval_loader.set_epoch(epoch)
with torch.inference_mode():
for batch, all_numseq, cur_numseq in eval_loader:
for (batch_tensor, batch_info), all_numseq, cur_numseq in eval_loader:
# To device
batch = {k: (v.to(args.device) if v is not None else None) for k, v in batch.items()}
batch_tensor = {k: (v.to(args.device) if v is not None else None) for k, v in batch_tensor.items()}

# Eval
eval_loss = (1 / all_numseq) * model_engine(**batch).loss
eval_loss = (1 / all_numseq) * model_engine(**batch_tensor, **batch_info).loss

# Accumulate eval loss
eval_total_loss.add_(eval_loss)
@@ -337,6 +347,9 @@ def train():
# Also save tokenizer from base model
save_tokenizer(args, save_path)

# Write metadata
save_openchat_metadata(args, epoch, save_path)


if __name__ == "__main__":
train()