Tutorial: Action-Conditioned 3D Human Motion Synthesis with Transformer VAE (ACTOR)

ACTOR learns an action-aware latent representation for human motions by training a generative variational autoencoder (VAE). By sampling from this latent space and querying a certain duration through a series of positional encodings, ACTOR synthesizes variable-length motion sequences conditioned on a categorical action. Specifically, a Transformer-based architecture is designed for encoding and decoding a sequence of parametric SMPL human body models estimated from action recognition datasets.

Dataset for ACTOR

To obtain and pre-process the dataset, please refer to this GitHub repository and agree to the license. The following code shows examples from the HumanAct12 dataset.

[1]:
# Set data path: folder holding the pre-processed HumanAct12 poses.
# Point this at your local copy; it is handed to get_datasets() further down.
# Example of an alternative location:
# datapath = "F://research/ACTOR/data/HumanAct12Poses/"
datapath = "E://researches/GenMotion/thirdParty/HumanAct12Poses/"
[2]:
import torch
import os
[3]:
from genmotion.algorithm.action_conditioned.params import HumanAct12Params
from genmotion.algorithm.action_conditioned.data_utils import get_datasets

from genmotion.algorithm.action_conditioned.utils.tensors import collate

Training ACTOR

[ ]:
# load parameters
# vars() exposes the attributes of the HumanAct12Params instance as a plain
# dict, so settings can be looked up by name and unpacked with ** below.
parameters = vars(HumanAct12Params())
[ ]:
# Build the datasets from the pose folder and the parameter dict, then
# report how many items each entry of the returned mapping contains.
datasets = get_datasets(datapath, parameters)
print("dataset length: ", {split: len(subset) for split, subset in datasets.items()})
[ ]:
# load model
# This walkthrough hard-wires the Transformer CVAE classes imported below,
# so stop early if the loaded configuration names a different model.
assert parameters["modeltype"] == 'cvae'
assert parameters["archiname"] == "transformer"

from genmotion.algorithm.action_conditioned.models.architectures.transformer import Encoder_TRANSFORMER, Decoder_TRANSFORMER
from genmotion.algorithm.action_conditioned.models.modeltype.cvae import CVAE

# Both networks are configured from the full parameter dict, unpacked as
# keyword arguments.
encoder = Encoder_TRANSFORMER(**parameters)
decoder = Decoder_TRANSFORMER(**parameters)

# True only when "rcxyz" is among parameters["lambdas"].
# NOTE(review): this key is written after the encoder/decoder were built, so
# they do not see the value assigned here; only the CVAE constructor does.
# Presumably intentional - confirm against the model code.
parameters["outputxyz"] = "rcxyz" in parameters["lambdas"]

# Wrap encoder and decoder in the CVAE and move it to the configured device.
model = CVAE(encoder, decoder, **parameters).to(parameters["device"])
[ ]:
# AdamW over every model parameter, using the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=parameters["lr"])
# Report the total number of parameter elements, expressed in millions.
print('Total params: %.2fM' % (sum(weight.numel() for weight in model.parameters()) / 1e6))
# print("Training model..")
[ ]:
# Shuffled mini-batch loader over the "train" entry of the datasets;
# each batch is assembled by the imported `collate` function.
dataset = datasets["train"]
train_iterator = torch.utils.data.DataLoader(
    dataset,
    batch_size=parameters["batch_size"],
    shuffle=True,
    num_workers=8,
    collate_fn=collate,
)
[ ]:
from genmotion.algorithm.action_conditioned.trainer import train
[ ]:
# Display the model's `device` attribute; the same value is what the
# (commented-out) training loop below hands to train().
model.device
[ ]:
# Upper bound for the (commented-out) training loop in the next cell.
epochs = 1 # total number of training epochs
[ ]:
# for epoch in range(epochs):
#     dict_loss = train(model, optimizer, train_iterator, model.device)

#     for key in dict_loss.keys():
#         dict_loss[key] /= len(train_iterator)
#         print(f"Loss/{key}", dict_loss[key], f"{epoch}")

Sample ACTOR

[5]:
# load parameters
# Same dict view of the parameter object as in the training section, but
# built with mode="sample". This rebinds `parameters`, replacing the
# training configuration for the rest of the notebook.
parameters = vars(HumanAct12Params(mode="sample"))
[6]:
# load model
# This walkthrough hard-wires the Transformer CVAE classes imported below,
# so stop early if the loaded configuration names a different model.
assert parameters["modeltype"] == 'cvae'
assert parameters["archiname"] == "transformer"

from genmotion.algorithm.action_conditioned.models.architectures.transformer import Encoder_TRANSFORMER, Decoder_TRANSFORMER
from genmotion.algorithm.action_conditioned.models.modeltype.cvae import CVAE

# Both networks are configured from the full parameter dict, unpacked as
# keyword arguments.
encoder = Encoder_TRANSFORMER(**parameters)
decoder = Decoder_TRANSFORMER(**parameters)

# True only when "rcxyz" is among parameters["lambdas"].
# NOTE(review): this key is written after the encoder/decoder were built, so
# they do not see the value assigned here; only the CVAE constructor does.
# Presumably intentional - confirm against the model code.
parameters["outputxyz"] = "rcxyz" in parameters["lambdas"]

# Wrap encoder and decoder in the CVAE and move it to the configured device.
# The weights are still freshly initialised here; a later cell restores them
# from a checkpoint.
model = CVAE(encoder, decoder, **parameters).to(parameters["device"])
[7]:
# Location of the pretrained checkpoint. The two parts are joined with
# os.path.join in the next cell; the folder is a relative path, so it is
# resolved against the notebook's working directory.
folder = "../pretrained/action_conditioned/humanact12/"
checkpointname = "checkpoint_5000.pth.tar"
[8]:
print("Restore weights..")
checkpointpath = os.path.join(folder, checkpointname)
# map_location places the stored tensors on the configured device, so the
# checkpoint can be read regardless of the device it was saved from.
# NOTE(review): torch.load unpickles the file - only load checkpoints that
# come from a trusted source.
state_dict = torch.load(checkpointpath, map_location=parameters["device"])
# Keep this as the last expression of the cell: its return value is what the
# notebook displays as the key-matching report.
model.load_state_dict(state_dict)
Restore weights..
[8]:
<All keys matched successfully>
[9]:
# One integer label per class: 0 .. num_classes - 1. These are the class
# labels handed to model.generate() below.
num_classes = parameters["num_classes"]
classes = torch.arange(num_classes)
[10]:
# Request the same duration, parameters["num_frames"], for every class label.
gendurations = torch.tensor([parameters["num_frames"]] * len(classes), dtype=int)
[11]:
# Noise modes forwarded to model.generate() below through the keyword
# arguments of the same names.
noise_same_action = "interpolate"
noise_diff_action = "random"
# Alternative setting for the same-action noise:
# noise_same_action = "random"
[12]:
# Display the device attribute of the restored model.
model.device
[12]:
device(type='cuda')
[14]:
# Display the configured number of samples per action.
# NOTE(review): the generate call below passes nspa=2 explicitly instead of
# this value (nspa presumably means "number of samples per action" - confirm).
parameters["num_samples_per_action"]
[14]:
10
[15]:
# Generate the new data
# Put the model in inference mode first: eval() switches off train-only
# layer behaviour such as dropout, and no_grad() stops autograd from
# recording a graph that sampling never uses (saves memory and time).
model.eval()
with torch.no_grad():
    # One duration per class label; nspa is fixed to 2 for this demo, while
    # the noise modes and the latent factor come from the settings above.
    generation = model.generate(classes, gendurations, nspa=2,
                                noise_same_action=noise_same_action,
                                noise_diff_action=noise_diff_action,
                                fact=parameters["fact_latent"])
[20]:
# Inspect the shape of the generated 'output_xyz' entry.
generation['output_xyz'].shape
[20]:
torch.Size([24, 24, 3, 60])