Source code for src.dataset_creation.statistics

"""
This module contains useful classes used to obtain metadata from the dataset generation processes.
"""


from pydantic import BaseModel
from typing import Literal, Optional
from pathlib import Path
import numpy as np
import pickle
import collections
import matplotlib.pyplot as plt


[docs]
class Metadata(BaseModel):  # numpydoc ignore=PR01
    """
    Utility class to keep the sample metadata information.

    One instance holds the values recorded for a single sample. Iterating
    over an instance yields ``(field name, value)`` pairs in declaration
    order, which is what ``DatasetStats.write_metadata_files`` relies on when
    it writes the per-sample text files.
    """

    # presumably the random seed used to generate the sample -- confirm
    seed: int

    # one of the three track types counted by DatasetStats
    track_type: Literal["PSS", "AC_rail", "subgrade"]

    # nullable: no default is declared here, None must be a valid value
    ballast_simulation_seed: Optional[int]
    fouling_level: float
    is_fouled: bool
    general_deterioration: float

    # layer name -> size; DatasetStats only accepts the keys
    # "fouling", "ballast", "asphalt" and "PSS"
    layer_sizes: dict[str, float]

    general_water_content: float
    # three values, averaged per position by DatasetStats
    # (presumably one per layer -- confirm the layer order)
    water_infiltrations: tuple[float, float, float]
    # three (float, float) pairs (presumably a (min, max) range per layer)
    layer_water_ranges: tuple[tuple[float, float], tuple[float, float], tuple[float, float]]

    sleepers_material : Literal["wood", "steel", "concrete"]
    # one 3-float tuple per sleeper; DatasetStats uses the list length as the
    # number of sleepers
    sleeper_positions: list[tuple[float, float, float]]

    # calculated after the sampling, based on general deterioration
    # and layer water ranges; None until then.
    # NOTE(review): DatasetStats slices these with ``[4:]`` and reads two
    # values from the result, so the tuples stored at runtime are presumably
    # longer than the two elements annotated here -- confirm and fix the
    # annotation if so.
    fouling_material: Optional[tuple[float, float]] = None
    pss_material: Optional[tuple[float, float]] = None
    subsoil_material: Optional[tuple[float, float]] = None




[docs]
class DatasetStats():
    """
    Class used to store and save metadata from multiple runs to file.

    Parameters
    ----------
    stats : dict[str, Metadata]
        Dictionary containing statistics relative to input files. Each key is
        also used as the name of the per-sample text file.
    """
    def __init__(self, stats: dict[str, Metadata]) -> None:
        self.stats = stats

    @staticmethod
    def _save_histogram(values, title: str, path: Path) -> None:
        """
        Plot a 30-bin histogram of ``values`` and save it to ``path``.

        The figure is closed even if plotting or saving fails, so it never
        stays registered in pyplot.
        """
        fig, ax = plt.subplots()
        try:
            fig.suptitle(title)
            ax.hist(values, bins=30)
            fig.savefig(path)
        finally:
            plt.close(fig)

    def write_metadata_files(self, directory: Path) -> None:
        """
        Writes on disk the dataset statistics.

        The following entries are created inside ``directory``:

        * one ``.txt`` file per entry of ``self.stats``, named after the
          entry key, with one ``field: value`` line per metadata field;
        * ``all_data.pkl``, a pickle of this object;
        * ``statistics.txt``, with counts and averages over all samples;
        * ``plots/``, with the histograms of the sampled quantities.

        Only the figures created by this method are closed; figures the
        caller has open in pyplot are left untouched.

        Parameters
        ----------
        directory : Path
            Directory in which to save the dataset statistics. It is created,
            together with any missing parent, if it does not exist.
        """
        directory.mkdir(parents=True, exist_ok=True)

        # write a file for each sample
        for file, metadata in self.stats.items():
            info_file = (directory / file).with_suffix(".txt")
            with open(info_file, "w", encoding="utf-8") as f:
                f.writelines(f"{k}: {j}\n" for k, j in metadata)

        # write a single pkl file with all the information
        with open(directory / "all_data.pkl", "wb") as f:
            pickle.dump(self, f)

        # calculate meaningful statistics for the dataset
        samples = list(self.stats.values())

        track_type = {
            "PSS": 0,
            "AC_rail": 0,
            "subgrade": 0
        }  # counts
        sleepers_material_counts = {"wood": 0, "steel": 0, "concrete": 0}  # counts
        layer_sizes = {"fouling": [], "ballast": [], "asphalt": [], "PSS": []}  # distribution
        water_contents = {"fouling": [], "PSS": [], "subsoil": []}  # distribution per layer

        for metadata in samples:
            track_type[metadata.track_type] += 1
            sleepers_material_counts[metadata.sleepers_material] += 1
            for name, size in metadata.layer_sizes.items():
                layer_sizes[name].append(size)

            # The material tuples are filled in after sampling and default to
            # None; samples that do not have them yet are left out of the
            # water content distributions instead of aborting the export.
            # NOTE(review): the slice keeps everything from index 4 on and the
            # plot below reads the first two of those values, which presumes
            # the stored tuples are longer than the two-element annotation on
            # Metadata -- confirm.
            if metadata.is_fouled and metadata.fouling_material is not None:
                water_contents["fouling"].append(metadata.fouling_material[4:])
            if metadata.pss_material is not None:
                water_contents["PSS"].append(metadata.pss_material[4:])
            if metadata.subsoil_material is not None:
                water_contents["subsoil"].append(metadata.subsoil_material[4:])

        # fraction of fouled samples
        is_fouled_percentage = np.array([m.is_fouled for m in samples]).mean()
        fouling_level = np.array([m.fouling_level for m in samples])
        general_water_content_distrib = np.array(
            [m.general_water_content for m in samples]
        )
        # mean over the samples, one value per infiltration position
        water_infiltrations_percentages = np.array(
            [m.water_infiltrations for m in samples]
        ).mean(axis=0)
        # how many samples have a given number of sleepers
        sleepers_counts_distrib = collections.Counter(
            len(m.sleeper_positions) for m in samples
        )

        # write statistics
        with open(directory / "statistics.txt", "w", encoding="utf-8") as f:
            f.write(f"track types: {track_type}\n")
            f.write(f"fouled percentage: {is_fouled_percentage}\n")
            f.write(f"water infiltrations percentages: {water_infiltrations_percentages}\n")
            f.write(f"sleeper number distribution: {sleepers_counts_distrib}\n")
            f.write(f"sleeper material distribution: {sleepers_material_counts}\n")

        # plot distributions
        plots_dir = directory / "plots"
        plots_dir.mkdir(exist_ok=True)

        self._save_histogram(
            fouling_level, "Fouling level", plots_dir / "fouling_level.png"
        )

        # layer sizes
        fig, axs = plt.subplots(ncols=4, sharey=True, tight_layout=True)
        try:
            fig.suptitle("layer sizes")
            for ax, name in zip(axs, ("fouling", "ballast", "asphalt", "PSS")):
                ax.hist(np.array(layer_sizes[name]), bins=30)
                ax.set_title(name)
            fig.savefig(plots_dir / "layer_sizes.png")
        finally:
            plt.close(fig)

        # general water content
        self._save_histogram(
            general_water_content_distrib,
            "General water content distribution",
            plots_dir / "general_water_content.png",
        )

        # layers water ranges
        fig, axs = plt.subplots(ncols=3, sharex=True, sharey=True, tight_layout=True)
        try:
            fig.suptitle("water content distributions")
            for ax, name in zip(axs, ("fouling", "PSS", "subsoil")):
                rows = water_contents[name]
                # a layer without any collected row (e.g. no fouled sample)
                # keeps an empty, titled panel
                if rows:
                    ranges = np.array(rows)
                    ax.hist2d(ranges[:, 0], ranges[:, 1], bins=30)
                ax.set_title(name)
            fig.savefig(plots_dir / "water_content.png")
        finally:
            plt.close(fig)