Source code for nerfstudio.data.dataparsers.nuscenes_dataparser

# Copyright 2022 the Regents of the University of California, Nerfstudio Team and contributors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Data parser for NuScenes dataset"""
import math
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Literal, Optional, Tuple, Type

import numpy as np
import pyquaternion
import torch

from nerfstudio.cameras.cameras import Cameras, CameraType
from nerfstudio.data.dataparsers.base_dataparser import DataParser, DataParserConfig, DataparserOutputs
from nerfstudio.data.scene_box import SceneBox


def rotation_translation_to_pose(r_quat, t_vec):
    """Convert quaternion rotation and translation vectors to 4x4 matrix"""

    pose = np.eye(4)

    # NB: NuScenes recommends pyquaternion, which uses scalar-first format (w x y z)
    # https://github.com/nutonomy/nuscenes-devkit/issues/545#issuecomment-766509242
    # https://github.com/KieranWynn/pyquaternion/blob/99025c17bab1c55265d61add13375433b35251af/pyquaternion/quaternion.py#L299
    # https://fzheng.me/2017/11/12/quaternion_conventions_en/
    pose[:3, :3] = pyquaternion.Quaternion(r_quat).rotation_matrix

    pose[:3, 3] = t_vec
    return pose
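
# A quick usage sketch for the helper above (hypothetical values, not part of
# the original module). In pyquaternion's scalar-first convention the identity
# quaternion is [w, x, y, z] = [1, 0, 0, 0], so the rotation block comes out
# as the identity and the translation is copied through:
#
#     pose = rotation_translation_to_pose([1.0, 0.0, 0.0, 0.0], [1.0, 2.0, 3.0])
#     assert np.allclose(pose[:3, :3], np.eye(3))
#     assert np.allclose(pose[:3, 3], [1.0, 2.0, 3.0])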
@dataclass
class NuScenesDataParserConfig(DataParserConfig):
    """NuScenes dataset config.
    NuScenes (https://www.nuscenes.org/nuscenes) is an autonomous driving dataset containing 1000 20s clips.
    Each clip was recorded with a suite of sensors including 6 surround cameras.
    It also includes 3D cuboid annotations around objects.
    We optionally use these cuboids to mask dynamic objects by specifying the mask_dir flag.
    To create these masks use nerfstudio/scripts/datasets/process_nuscenes_masks.py.
    """

    _target: Type = field(default_factory=lambda: NuScenes)
    """target class to instantiate"""
    data: Path = Path("scene-0103")  # TODO: rename to scene but keep checkpoint saving name?
    """Name of the scene."""
    data_dir: Path = Path("/mnt/local/NuScenes")
    """Path to NuScenes dataset."""
    version: Literal["v1.0-mini", "v1.0-trainval"] = "v1.0-mini"
    """Dataset version."""
    cameras: Tuple[Literal["FRONT", "FRONT_LEFT", "FRONT_RIGHT", "BACK", "BACK_LEFT", "BACK_RIGHT"], ...] = ("FRONT",)
    """Which cameras to use."""
    mask_dir: Optional[Path] = None
    """Path to masks of dynamic objects."""
    train_split_fraction: float = 0.9
    """The fraction of images to use for training. The remaining images are for eval."""
    verbose: bool = False
    """Load dataset with verbose messaging"""
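
# A minimal configuration sketch (hypothetical paths; adjust to your local
# NuScenes install). DataParserConfig inherits nerfstudio's InstantiateConfig,
# so setup() instantiates the _target class with this config:
#
#     config = NuScenesDataParserConfig(
#         data=Path("scene-0103"),
#         data_dir=Path("/data/sets/nuscenes"),
#         version="v1.0-mini",
#         cameras=("FRONT",),  # the parser currently asserts a single camera
#         train_split_fraction=0.9,
#     )
#     parser = config.setup()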
@dataclass
class NuScenes(DataParser):
    """NuScenes DatasetParser"""

    config: NuScenesDataParserConfig

    def _generate_dataparser_outputs(self, split="train"):
        # nuscenes is slow to import, so we only do it if we need it.
        from nuscenes.nuscenes import NuScenes as NuScenesDatabase

        nusc = NuScenesDatabase(
            version=self.config.version,
            dataroot=str(self.config.data_dir.absolute()),
            verbose=self.config.verbose,
        )
        cameras = ["CAM_" + camera for camera in self.config.cameras]

        assert (
            len(cameras) == 1
        ), "waiting on multiple camera support"  # TODO: remove once multiple cameras are supported

        # get samples for scene
        samples = [
            samp for samp in nusc.sample if nusc.get("scene", samp["scene_token"])["name"] == str(self.config.data)
        ]

        # sort by timestamp (only to make chronological viz easier)
        samples.sort(key=lambda x: (x["scene_token"], x["timestamp"]))

        transform1 = np.array(
            [
                [0, -1, 0, 0],
                [0, 0, -1, 0],
                [1, 0, 0, 0],
                [0, 0, 0, 1],
            ]
        )
        transform2 = np.array(
            [
                [0, 0, 1, 0],
                [0, 1, 0, 0],
                [-1, 0, 0, 0],
                [0, 0, 0, 1],
            ]
        )

        # get image filenames and camera data
        image_filenames = []
        mask_filenames = []
        mask_dir = self.config.mask_dir if self.config.mask_dir is not None else Path("")
        intrinsics = []
        poses = []
        for sample in samples:
            for camera in cameras:
                camera_data = nusc.get("sample_data", sample["data"][camera])
                calibrated_sensor_data = nusc.get("calibrated_sensor", camera_data["calibrated_sensor_token"])
                ego_pose_data = nusc.get("ego_pose", camera_data["ego_pose_token"])

                ego_pose = rotation_translation_to_pose(ego_pose_data["rotation"], ego_pose_data["translation"])
                cam_pose = rotation_translation_to_pose(
                    calibrated_sensor_data["rotation"], calibrated_sensor_data["translation"]
                )
                pose = ego_pose @ cam_pose

                # rotate to opencv frame
                pose = transform1 @ pose

                # convert from opencv camera to nerfstudio camera
                pose[0:3, 1:3] *= -1
                pose = pose[np.array([1, 0, 2, 3]), :]
                pose[2, :] *= -1

                # rotate to z-up in viewer
                pose = transform2 @ pose

                image_filenames.append(self.config.data_dir / camera_data["filename"])
                mask_filenames.append(
                    mask_dir / "masks" / camera / os.path.split(camera_data["filename"])[1].replace("jpg", "png")
                )
                intrinsics.append(calibrated_sensor_data["camera_intrinsic"])
                poses.append(pose)
        poses = torch.from_numpy(np.stack(poses).astype(np.float32))
        intrinsics = torch.from_numpy(np.array(intrinsics).astype(np.float32))

        # center poses
        poses[:, :3, 3] -= poses[:, :3, 3].mean(dim=0)

        # scale poses
        poses[:, :3, 3] /= poses[:, :3, 3].abs().max()

        # filter image_filenames and poses based on train/eval split fraction
        num_snapshots = len(samples)
        num_train_snapshots = math.ceil(num_snapshots * self.config.train_split_fraction)
        num_eval_snapshots = num_snapshots - num_train_snapshots
        i_all = np.arange(num_snapshots)
        i_train = np.linspace(
            0, num_snapshots - 1, num_train_snapshots, dtype=int
        )  # equally spaced training snapshots starting and ending at 0 and num_snapshots - 1
        i_eval = np.setdiff1d(i_all, i_train)  # eval images are the remaining images
        assert len(i_eval) == num_eval_snapshots
        i_train = (i_train[None, :] * len(cameras) + np.arange(len(cameras))[:, None]).ravel()
        i_eval = (i_eval[None, :] * len(cameras) + np.arange(len(cameras))[:, None]).ravel()
        if split == "train":
            indices = i_train
        elif split in ["val", "test"]:
            indices = i_eval
        else:
            raise ValueError(f"Unknown dataparser split {split}")

        # choose image_filenames and poses based on split, but only after centering and scaling the poses
        image_filenames = [image_filenames[i] for i in indices]
        mask_filenames = [mask_filenames[i] for i in indices]
        intrinsics = intrinsics[indices]
        poses = poses[indices]

        # in x,y,z order
        # assumes that the scene is centered at the origin
        aabb_scale = 1.0
        scene_box = SceneBox(
            aabb=torch.tensor(
                [[-aabb_scale, -aabb_scale, -aabb_scale], [aabb_scale, aabb_scale, aabb_scale]], dtype=torch.float32
            )
        )

        cameras = Cameras(
            fx=intrinsics[:, 0, 0].detach().clone(),
            fy=intrinsics[:, 1, 1].detach().clone(),
            cx=intrinsics[:, 0, 2].detach().clone(),
            cy=intrinsics[:, 1, 2].detach().clone(),
            height=900,
            width=1600,
            camera_to_worlds=poses[:, :3, :4],
            camera_type=CameraType.PERSPECTIVE,
        )

        dataparser_outputs = DataparserOutputs(
            image_filenames=image_filenames,
            cameras=cameras,
            scene_box=scene_box,
            mask_filenames=mask_filenames if self.config.mask_dir is not None else None,
        )
        return dataparser_outputs
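
# End-to-end usage sketch (hypothetical; assumes the v1.0-mini split is
# available at the configured data_dir). DataParser.get_dataparser_outputs
# wraps _generate_dataparser_outputs, so a typical call looks like:
#
#     parser = NuScenesDataParserConfig().setup()
#     outputs = parser.get_dataparser_outputs(split="train")
#     print(len(outputs.image_filenames))            # number of training frames
#     print(outputs.cameras.camera_to_worlds.shape)  # (num_frames, 3, 4)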