# Source code for anchor_python_visualization.embeddings.features

"""Loading and labelling embeddings."""

__author__ = "Owen Feehan"
__copyright__ = "Copyright (C) 2021 Owen Feehan"
__license__ = "MIT"
__version__ = "0.1"

import argparse
import os
from typing import Optional

import numpy as np
import pandas as pd

from ._identifiers import select_or_create_identifiers
from ._labels import labels_from_identifiers
from .label import LabelledFeatures

COLUMN_NAME_IDENTIFIER: str = "identifier"
"""Name of the column that holds the row identifiers, and which becomes the index."""

PLACEHOLDER_FOR_SUBSTITUTION: str = "<IMAGE>"
"""Optional placeholder in an image-directory argument, replaced by a path or sequence number."""


def load_features(args: argparse.Namespace) -> LabelledFeatures:
    """Loads the embeddings from a CSV file and determines identifiers and labels.

    This determination occurs according to command-line arguments.

    The numeric columns of the CSV form the features. The string columns and numeric columns are
    both passed to :code:`select_or_create_identifiers` to obtain an identifier for each row, which
    becomes the index of the features.

    Args:
        args: the command-line arguments. The attributes read are :code:`file_path_to_csv`,
            :code:`encoding`, :code:`max_label_index`, :code:`image_path` and
            :code:`image_sequence`.

    Returns:
        newly-created instance of features after having been loaded.
    """
    # Read all columns, text and number
    all_columns = _read_csv(args.file_path_to_csv, encoding=args.encoding)

    # Find the numeric and string columns
    numeric_columns = all_columns.select_dtypes(include=np.number)
    string_columns = all_columns.select_dtypes(include=["object"])

    # Extract or create identifiers for the data-frame
    identifiers = select_or_create_identifiers(string_columns, numeric_columns)

    # A copy is passed, as _add_row_names modifies the data-frame it is given in place
    features_with_identifiers = _add_row_names(numeric_columns.copy(), identifiers)

    group_labels = _derive_group_label_from_identifiers(
        features_with_identifiers, args.max_label_index
    )
    image_paths = _maybe_image_paths(
        features_with_identifiers, args.image_path, args.image_sequence
    )
    return LabelledFeatures(features_with_identifiers, group_labels, image_paths)


def _read_csv(file_path_to_csv: str, encoding: str) -> pd.DataFrame:
    """Loads a CSV file from the file-system into a data-frame, decoding it with `encoding`.

    The first row of the file supplies the column names, and no column is explicitly selected
    as the index.

    Args:
        file_path_to_csv: path to the CSV file to read.
        encoding: name of the text encoding used to decode the file.

    Returns:
        a data-frame with all the columns of the file.
    """
    read_options = {"index_col": None, "header": 0, "encoding": encoding}
    return pd.read_csv(file_path_to_csv, **read_options)


def _maybe_image_paths(
    features: pd.DataFrame,
    image_directory_path: Optional[str],
    image_directory_sequence: Optional[str],
) -> Optional[pd.Series]:
    """Maybe creates a series of image-paths, one for each row of the data-frame.

    No paths are created if neither image-directory argument is set (each is None or empty), and
    instead None is returned. If both are set, image_directory_path takes precedence.

    Args:
      features: data-frame the images refer to.
      image_directory_path: if set, the index name of data-frame (a relative path) for each
        feature row is appended/substituted to form a complete path to an image.
      image_directory_sequence: if set, a six-digit zero-padded integer sequence (beginning at
        zero) for each feature row is appended/substituted to form a complete path to an image.

    Returns:
        a series with an identical number of rows in identical order, and indexed identically to
        features, or None.
    """
    # If image_directory_path is set, form complete image-paths for each feature-row by using the
    # path (the label in the index) of the data frame to join or substitute
    if image_directory_path:
        return features.index.to_series().map(
            lambda path: _join_or_substitute(image_directory_path, path)
        )

    # If image_directory_sequence is set, form complete image-paths for each feature-row using a
    # six digit sequence to join or substitute
    if image_directory_sequence:
        # Index the sequence by the identifiers, so that the result aligns with the rows of
        # features exactly as the path-based variant does
        row_numbers = pd.Series(range(len(features.index)), index=features.index)
        return row_numbers.map(
            lambda number: _join_or_substitute(
                image_directory_sequence, f"{number:06d}"
            )
        )

    # Neither argument is set
    return None


def _join_or_substitute(image_directory: str, path: str) -> str:
    """Derives a path to an image by either joining path to image_directory or substituting it.

    The substitution occurs if image_directory contains the sub-string
    :code:`PLACEHOLDER_FOR_SUBSTITUTION`, in which case only the first occurrence of the
    placeholder is replaced by path. Otherwise path is joined onto image_directory.

    In both cases, paths are normed so that directory-separators match the execution environment.

    Args:
        image_directory: either the absolute path to a directory OR such a path with a placeholder
            :code:`PLACEHOLDER_FOR_SUBSTITUTION` which can be substituted.
        path: the relative-path to an image.

    Returns:
        either the relative-path joined to image_directory or the relative-path substituted into
            image_directory in place of :code:`PLACEHOLDER_FOR_SUBSTITUTION`.

    """
    if PLACEHOLDER_FOR_SUBSTITUTION in image_directory:
        # Each part is normed separately, before the first placeholder is replaced
        return os.path.normpath(image_directory).replace(
            PLACEHOLDER_FOR_SUBSTITUTION, os.path.normpath(path), 1
        )

    # Norm the joined path too, so that both branches give consistent directory-separators
    return os.path.normpath(os.path.join(image_directory, path))


def _add_row_names(features: pd.DataFrame, row_names: pd.Series) -> pd.DataFrame:
    """Makes a series the index (row-names) of a data-frame, modifying the data-frame in place.

    Args:
        features: the data-frame to modify.
        row_names: the name for each row. It is first assigned as a column (so pandas aligns it
            on the existing index of features) and that column then becomes the index.

    Returns:
        the same data-frame object that was passed in, now indexed by the row-names.
    """
    index_column = COLUMN_NAME_IDENTIFIER

    # Stage the names as a regular column, then promote that column to be the index
    features[index_column] = row_names
    features.set_index(index_column, inplace=True)
    return features


def _derive_group_label_from_identifiers(
    features: pd.DataFrame, max_label_index: int
) -> pd.Series:
    """Builds a categorical series of group labels, derived from the index of a data-frame.

    The labels come from :code:`labels_from_identifiers`, applied to the index values.

    Args:
        features: data-frame whose index holds the identifier for each row.
        max_label_index: passed unchanged to :code:`labels_from_identifiers` (presumably limits
            which part of an identifier forms the label - confirm against that function).

    Returns:
        a series of dtype category, with the same index as features.
    """
    identifiers = features.index
    group_labels = list(labels_from_identifiers(identifiers.values, max_label_index))
    return pd.Series(group_labels, index=identifiers, dtype="category")