Source code for nerfstudio.data.dataparsers.nuscenes_dataparser

# Copyright 2022 the Regents of the University of California, Nerfstudio Team and contributors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Data parser for NuScenes dataset"""

import math
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Literal, Optional, Tuple, Type

import numpy as np
import pyquaternion
import torch

from nerfstudio.cameras.cameras import Cameras, CameraType
from nerfstudio.data.dataparsers.base_dataparser import DataParser, DataParserConfig, DataparserOutputs
from nerfstudio.data.scene_box import SceneBox


[docs]def rotation_translation_to_pose(r_quat, t_vec): """Convert quaternion rotation and translation vectors to 4x4 matrix""" pose = np.eye(4) # NB: Nuscenes recommends pyquaternion, which uses scalar-first format (w x y z) # https://github.com/nutonomy/nuscenes-devkit/issues/545#issuecomment-766509242 # https://github.com/KieranWynn/pyquaternion/blob/99025c17bab1c55265d61add13375433b35251af/pyquaternion/quaternion.py#L299 # https://fzheng.me/2017/11/12/quaternion_conventions_en/ pose[:3, :3] = pyquaternion.Quaternion(r_quat).rotation_matrix pose[:3, 3] = t_vec return pose
[docs]@dataclass class NuScenesDataParserConfig(DataParserConfig): """NuScenes dataset config. NuScenes (https://www.nuscenes.org/nuscenes) is an autonomous driving dataset containing 1000 20s clips. Each clip was recorded with a suite of sensors including 6 surround cameras. It also includes 3D cuboid annotations around objects. We optionally use these cuboids to mask dynamic objects by specifying the mask_dir flag. To create these masks use nerfstudio/scripts/datasets/process_nuscenes_masks.py. """ _target: Type = field(default_factory=lambda: NuScenes) """target class to instantiate""" data: Path = Path("scene-0103") # TODO: rename to scene but keep checkpoint saving name? """Name of the scene.""" data_dir: Path = Path("/mnt/local/NuScenes") """Path to NuScenes dataset.""" version: Literal["v1.0-mini", "v1.0-trainval"] = "v1.0-mini" """Dataset version.""" cameras: Tuple[Literal["FRONT", "FRONT_LEFT", "FRONT_RIGHT", "BACK", "BACK_LEFT", "BACK_RIGHT"], ...] = ("FRONT",) """Which cameras to use.""" mask_dir: Optional[Path] = None """Path to masks of dynamic objects.""" train_split_fraction: float = 0.9 """The percent of images to use for training. The remaining images are for eval.""" verbose: bool = False """Load dataset with verbose messaging"""
[docs]@dataclass class NuScenes(DataParser): """NuScenes DatasetParser""" config: NuScenesDataParserConfig def _generate_dataparser_outputs(self, split="train"): # nuscenes is slow to import, so we only do it if we need it. from nuscenes.nuscenes import NuScenes as NuScenesDatabase nusc = NuScenesDatabase( version=self.config.version, dataroot=str(self.config.data_dir.absolute()), verbose=self.config.verbose, ) cameras = ["CAM_" + camera for camera in self.config.cameras] assert len(cameras) == 1, ( "waiting on multiple camera support" ) # TODO: remove once multiple cameras are supported # get samples for scene samples = [ samp for samp in nusc.sample if nusc.get("scene", samp["scene_token"])["name"] == str(self.config.data) ] # sort by timestamp (only to make chronological viz easier) samples.sort(key=lambda x: (x["scene_token"], x["timestamp"])) transform1 = np.array( [ [0, -1, 0, 0], [0, 0, -1, 0], [1, 0, 0, 0], [0, 0, 0, 1], ] ) transform2 = np.array( [ [0, 0, 1, 0], [0, 1, 0, 0], [-1, 0, 0, 0], [0, 0, 0, 1], ] ) # get image filenames and camera data image_filenames = [] mask_filenames = [] mask_dir = self.config.mask_dir if self.config.mask_dir is not None else Path("") intrinsics = [] poses = [] for sample in samples: for camera in cameras: camera_data = nusc.get("sample_data", sample["data"][camera]) calibrated_sensor_data = nusc.get("calibrated_sensor", camera_data["calibrated_sensor_token"]) ego_pose_data = nusc.get("ego_pose", camera_data["ego_pose_token"]) ego_pose = rotation_translation_to_pose(ego_pose_data["rotation"], ego_pose_data["translation"]) cam_pose = rotation_translation_to_pose( calibrated_sensor_data["rotation"], calibrated_sensor_data["translation"] ) pose = ego_pose @ cam_pose # rotate to opencv frame pose = transform1 @ pose # convert from opencv camera to nerfstudio camera pose[0:3, 1:3] *= -1 pose = pose[np.array([1, 0, 2, 3]), :] pose[2, :] *= -1 # rotate to z-up in viewer pose = transform2 @ pose image_filenames.append(self.config.data_dir / camera_data["filename"]) mask_filenames.append( mask_dir / "masks" / camera / os.path.split(camera_data["filename"])[1].replace("jpg", "png") ) intrinsics.append(calibrated_sensor_data["camera_intrinsic"]) poses.append(pose) poses = torch.from_numpy(np.stack(poses).astype(np.float32)) intrinsics = torch.from_numpy(np.array(intrinsics).astype(np.float32)) # center poses poses[:, :3, 3] -= poses[:, :3, 3].mean(dim=0) # scale poses poses[:, :3, 3] /= poses[:, :3, 3].abs().max() # filter image_filenames and poses based on train/eval split percentage num_snapshots = len(samples) num_train_snapshots = math.ceil(num_snapshots * self.config.train_split_fraction) num_eval_snapshots = num_snapshots - num_train_snapshots i_all = np.arange(num_snapshots) i_train = np.linspace( 0, num_snapshots - 1, num_train_snapshots, dtype=int ) # equally spaced training snapshots starting and ending at 0 and num_images-1 i_eval = np.setdiff1d(i_all, i_train) # eval images are the remaining images assert len(i_eval) == num_eval_snapshots i_train = (i_train[None, :] * len(cameras) + np.arange(len(cameras))[:, None]).ravel() i_eval = (i_eval[None, :] * len(cameras) + np.arange(len(cameras))[:, None]).ravel() if split == "train": indices = i_train elif split in ["val", "test"]: indices = i_eval else: raise ValueError(f"Unknown dataparser split {split}") # Choose image_filenames and poses based on split, but after auto orient and scaling the poses. image_filenames = [image_filenames[i] for i in indices] mask_filenames = [mask_filenames[i] for i in indices] intrinsics = intrinsics[indices] poses = poses[indices] # in x,y,z order # assumes that the scene is centered at the origin aabb_scale = 1.0 scene_box = SceneBox( aabb=torch.tensor( [[-aabb_scale, -aabb_scale, -aabb_scale], [aabb_scale, aabb_scale, aabb_scale]], dtype=torch.float32 ) ) cameras = Cameras( fx=intrinsics[:, 0, 0].detach().clone(), fy=intrinsics[:, 1, 1].detach().clone(), cx=intrinsics[:, 0, 2].detach().clone(), cy=intrinsics[:, 1, 2].detach().clone(), height=900, width=1600, camera_to_worlds=poses[:, :3, :4], camera_type=CameraType.PERSPECTIVE, ) dataparser_outputs = DataparserOutputs( image_filenames=image_filenames, cameras=cameras, scene_box=scene_box, mask_filenames=mask_filenames if self.config.mask_dir is not None else None, ) return dataparser_outputs