# Source code for nerfstudio.data.dataparsers.phototourism_dataparser

# Copyright 2022 the Regents of the University of California, Nerfstudio Team and contributors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Phototourism dataset parser. Datasets and documentation here: http://phototour.cs.washington.edu/datasets/"""
from __future__ import annotations

import math
from dataclasses import dataclass, field
from pathlib import Path
from typing import Literal, Type

import numpy as np
import torch

from nerfstudio.cameras import camera_utils
from nerfstudio.cameras.cameras import Cameras, CameraType
from nerfstudio.data.dataparsers.base_dataparser import DataParser, DataParserConfig, DataparserOutputs
from nerfstudio.data.scene_box import SceneBox

# TODO(1480) use pycolmap instead of colmap_parsing_utils
# import pycolmap
from nerfstudio.data.utils.colmap_parsing_utils import read_cameras_binary, read_images_binary
from nerfstudio.utils.rich_utils import CONSOLE


@dataclass
class PhototourismDataParserConfig(DataParserConfig):
    """Phototourism dataset parser config"""

    _target: Type = field(default_factory=lambda: Phototourism)
    """target class to instantiate"""
    data: Path = Path("data/phototourism/brandenburg-gate")
    """Directory specifying location of data."""
    scale_factor: float = 3.0
    """How much to scale the camera origins by."""
    alpha_color: str = "white"
    """alpha color of background"""
    train_split_fraction: float = 0.9
    """The fraction of images to use for training. The remaining images are for eval."""
    scene_scale: float = 1.0
    """How much to scale the region of interest by."""
    orientation_method: Literal["pca", "up", "vertical", "none"] = "up"
    """The method to use for orientation."""
    center_method: Literal["poses", "focus", "none"] = "poses"
    """The method to use to center the poses."""
    auto_scale_poses: bool = True
    """Whether to automatically scale the poses to fit in +/- 1 bounding box."""
@dataclass
class Phototourism(DataParser):
    """Phototourism dataset. This is based on https://github.com/kwea123/nerf_pl/blob/nerfw/datasets/phototourism.py
    and uses colmap's utils file to read the poses.
    """

    config: PhototourismDataParserConfig

    def __init__(self, config: PhototourismDataParserConfig):
        super().__init__(config=config)
        self.data: Path = config.data

    def _generate_dataparser_outputs(self, split="train"):
        """Read the COLMAP reconstruction under ``self.data`` and build the outputs for one split.

        Args:
            split: One of "train", "val", or "test".

        Returns:
            DataparserOutputs holding the split's image filenames, cameras, scene box,
            and the scale/transform applied to the poses.

        Raises:
            ValueError: If ``split`` is not a recognized split name.
        """
        with CONSOLE.status(f"[bold green]Reading phototourism images and poses for {split} split...") as _:
            # TODO(1480) use pycolmap
            # recon = pycolmap.Reconstruction(self.data / "dense" / "sparse")
            # cams = recon.cameras
            # imgs = recon.images
            cams = read_cameras_binary(self.data / "dense/sparse/cameras.bin")
            imgs = read_images_binary(self.data / "dense/sparse/images.bin")

        poses = []
        fxs = []
        fys = []
        cxs = []
        cys = []
        image_filenames = []

        for _id, cam in cams.items():
            img = imgs[_id]
            # Only the 4-parameter PINHOLE model (fx, fy, cx, cy) is supported.
            assert cam.model == "PINHOLE", "Only pinhole (perspective) camera model is supported at the moment"

            # Assemble the 4x4 world-to-camera matrix [R|t; 0 0 0 1] from the
            # COLMAP quaternion/translation, then invert it to get camera-to-world.
            pose = torch.cat([torch.tensor(img.qvec2rotmat()), torch.tensor(img.tvec.reshape(3, 1))], dim=1)
            pose = torch.cat([pose, torch.tensor([[0.0, 0.0, 0.0, 1.0]])], dim=0)
            poses.append(torch.linalg.inv(pose))
            fxs.append(torch.tensor(cam.params[0]))
            fys.append(torch.tensor(cam.params[1]))
            cxs.append(torch.tensor(cam.params[2]))
            cys.append(torch.tensor(cam.params[3]))

            image_filenames.append(self.data / "dense/images" / img.name)

        poses = torch.stack(poses).float()
        # Flip the y and z columns of the rotation/translation — presumably converting
        # from COLMAP's camera convention to this codebase's convention. TODO confirm.
        poses[..., 1:3] *= -1
        fxs = torch.stack(fxs).float()
        fys = torch.stack(fys).float()
        cxs = torch.stack(cxs).float()
        cys = torch.stack(cys).float()

        # filter image_filenames and poses based on train/eval split percentage
        num_images = len(image_filenames)
        num_train_images = math.ceil(num_images * self.config.train_split_fraction)
        num_eval_images = num_images - num_train_images
        i_all = np.arange(num_images)
        i_train = np.linspace(
            0, num_images - 1, num_train_images, dtype=int
        )  # equally spaced training images starting and ending at 0 and num_images-1
        i_eval = np.setdiff1d(i_all, i_train)  # eval images are the remaining images
        i_train = torch.tensor(i_train, dtype=torch.long)
        i_eval = torch.tensor(i_eval, dtype=torch.long)
        assert len(i_eval) == num_eval_images
        if split == "train":
            indices = i_train
        elif split in ["val", "test"]:
            indices = i_eval
        else:
            raise ValueError(f"Unknown dataparser split {split}")

        poses, transform_matrix = camera_utils.auto_orient_and_center_poses(
            poses, method=self.config.orientation_method, center_method=self.config.center_method
        )

        # Scale poses: optionally normalize the largest origin coordinate to 1,
        # then apply the user-configured scale factor.
        scale_factor = 1.0
        if self.config.auto_scale_poses:
            scale_factor /= float(torch.max(torch.abs(poses[:, :3, 3])))
        scale_factor *= self.config.scale_factor

        poses[:, :3, 3] *= scale_factor

        # in x,y,z order
        # assumes that the scene is centered at the origin
        aabb_scale = self.config.scene_scale
        scene_box = SceneBox(
            aabb=torch.tensor(
                [[-aabb_scale, -aabb_scale, -aabb_scale], [aabb_scale, aabb_scale, aabb_scale]], dtype=torch.float32
            )
        )

        cameras = Cameras(
            camera_to_worlds=poses[:, :3, :4],
            fx=fxs,
            fy=fys,
            cx=cxs,
            cy=cys,
            camera_type=CameraType.PERSPECTIVE,
        )

        # Select only the cameras/images belonging to the requested split.
        cameras = cameras[indices]
        image_filenames = [image_filenames[i] for i in indices]

        assert len(cameras) == len(image_filenames)

        dataparser_outputs = DataparserOutputs(
            image_filenames=image_filenames,
            cameras=cameras,
            scene_box=scene_box,
            dataparser_scale=scale_factor,
            dataparser_transform=transform_matrix,
        )

        return dataparser_outputs