# Source code for src.dataset_creation.statistics
"""
This module contains useful classes used to obtain metadata from the dataset generation processes.
"""
from pydantic import BaseModel
from typing import Literal, Optional
from pathlib import Path
import numpy as np
import pickle
import collections
import matplotlib.pyplot as plt
# [docs]
class Metadata(BaseModel): # numpydoc ignore=PR01
    """
    Utility class to keep the sample metadata information.

    One instance describes a single sample.  ``DatasetStats`` (same module)
    receives a dictionary of these records, dumps every field of each record
    to a text file by iterating over it, and aggregates several of the fields
    into dataset-wide statistics and histograms.
    """
    # Integer seed stored with the sample.
    # NOTE(review): presumably the RNG seed used to draw the sample -- confirm.
    seed: int
    # Track structure label; restricted to these three values.
    track_type: Literal["PSS", "AC_rail", "subgrade"]
    # NOTE(review): no default is declared; whether this field is then
    # required depends on the pydantic major version in use -- confirm.
    ballast_simulation_seed: Optional[int]
    fouling_level: float
    # DatasetStats only reads ``fouling_material`` when this flag is True.
    is_fouled: bool
    general_deterioration: float
    # Layer name -> size.  DatasetStats collects the sizes per name and only
    # tracks the names "fouling", "ballast", "asphalt" and "PSS".
    layer_sizes: dict[str, float]
    general_water_content: float
    # Three values; DatasetStats averages them position-wise across samples.
    # NOTE(review): presumably one value per layer -- confirm layer order.
    water_infiltrations: tuple[float, float, float]
    # Three pairs of floats.
    # NOTE(review): presumably a (min, max) water range per layer -- confirm.
    layer_water_ranges: tuple[tuple[float, float], tuple[float, float], tuple[float, float]]
    sleepers_material : Literal["wood", "steel", "concrete"]
    # One 3-float tuple per sleeper; DatasetStats uses the length of this
    # list as the number of sleepers in the sample.
    sleeper_positions: list[tuple[float, float, float]]
    # calculated after the sampling, based on general deterioration
    # and layer water ranges
    # NOTE(review): these are annotated as 2-tuples, but DatasetStats slices
    # each of them with ``[4:]``, which suggests longer tuples are stored at
    # runtime -- confirm and align the annotation or the slice.
    fouling_material: Optional[tuple[float, float]] = None
    pss_material: Optional[tuple[float, float]] = None
    subsoil_material: Optional[tuple[float, float]] = None
# [docs]
class DatasetStats:
    """
    Class used to store and save metadata from multiple runs to file.

    Parameters
    ----------
    stats : dict[str, Metadata]
        Dictionary containing statistics relative to input files. Each key is
        also used as the base name of the per-sample text file written by
        :meth:`write_metadata_files`.
    """

    def __init__(self, stats: dict[str, Metadata]) -> None:
        self.stats = stats

    def write_metadata_files(self, directory: Path) -> None:
        """
        Writes on disk the dataset statistics.

        The following files are produced inside ``directory``:

        * one ``<key>.txt`` per entry of ``stats``, listing every metadata
          field as ``name: value``;
        * ``all_data.pkl``, a pickle of this object;
        * ``statistics.txt``, with counts and averages over the whole dataset;
        * ``plots/*.png``, histograms of the sampled quantities.

        Parameters
        ----------
        directory : Path
            Directory in which to save the dataset statistics. It is created,
            together with any missing parent directory, if it does not exist.
        """
        # parents=True so that a nested output directory can be created in
        # one call instead of failing on a missing parent.
        directory.mkdir(parents=True, exist_ok=True)

        self._write_sample_files(directory)

        # write a single pkl file with all the information
        with open(directory / "all_data.pkl", "wb") as f:
            pickle.dump(self, f)

        summary = self._aggregate()
        self._write_statistics(directory / "statistics.txt", summary)

        plots_dir = directory / "plots"
        plots_dir.mkdir(exist_ok=True)
        self._plot_distributions(plots_dir, summary)

    def _write_sample_files(self, directory: Path) -> None:
        """
        Write one ``name: value`` text file per sample into ``directory``.
        """
        for file, metadata in self.stats.items():
            info_file = (directory / file).with_suffix(".txt")
            with open(info_file, "w", encoding="utf-8") as f:
                # iterating the metadata record yields (field name, value) pairs
                f.writelines(f"{k}: {j}\n" for k, j in metadata)

    @staticmethod
    def _as_rows(data: list) -> np.ndarray:
        """
        Convert a list of per-sample tuples into a 2-D array, one row each.
        """
        if not data:
            # np.array([]) would be 1-D; keep a 2-D shape with zero rows so
            # that the plotting code can test ``shape[0]`` uniformly.
            return np.empty((0, 2))
        rows = np.array(data)
        if rows.ndim == 1:
            rows = rows[None, :]
        return rows

    def _aggregate(self) -> dict:
        """
        Collect the per-sample metadata into dataset-wide statistics.

        Returns
        -------
        dict
            Counts, averages and raw value arrays, keyed by quantity name.
        """
        track_type = {"PSS": 0, "AC_rail": 0, "subgrade": 0}  # counts
        sleepers_material_counts = {"wood": 0, "steel": 0, "concrete": 0}  # counts
        is_fouled = []  # averaged into the fouled fraction
        fouling_level = []  # distribution
        layer_sizes = {"fouling": [], "ballast": [], "asphalt": [], "PSS": []}  # distribution
        general_water_content = []  # distribution
        water_infiltrations = []  # averaged position-wise
        water_contents = {"fouling": [], "PSS": [], "subsoil": []}  # 2-D distribution per layer
        sleeper_counts = []  # number of sleepers per sample

        for metadata in self.stats.values():
            track_type[metadata.track_type] += 1
            is_fouled.append(metadata.is_fouled)
            fouling_level.append(metadata.fouling_level)
            # only the four layer names above are tracked; any other name
            # raises KeyError
            for name, size in metadata.layer_sizes.items():
                layer_sizes[name].append(size)
            general_water_content.append(metadata.general_water_content)
            water_infiltrations.append(metadata.water_infiltrations)
            sleepers_material_counts[metadata.sleepers_material] += 1
            # NOTE(review): the [4:] slices imply material tuples longer than
            # the two floats annotated on Metadata; the trailing entries are
            # presumably the water-content pair of the layer -- confirm.
            if metadata.is_fouled:
                water_contents["fouling"].append(metadata.fouling_material[4:])
            water_contents["PSS"].append(metadata.pss_material[4:])
            water_contents["subsoil"].append(metadata.subsoil_material[4:])
            sleeper_counts.append(len(metadata.sleeper_positions))

        return {
            "track_type": track_type,
            "is_fouled_percentage": np.array(is_fouled).mean(),
            "fouling_level": np.array(fouling_level),
            "layer_sizes": {name: np.array(sizes) for name, sizes in layer_sizes.items()},
            "general_water_content": np.array(general_water_content),
            "water_infiltrations_percentages": np.array(water_infiltrations).mean(axis=0),
            "water_contents": {
                name: self._as_rows(data) for name, data in water_contents.items()
            },
            "sleeper_counts": collections.Counter(sleeper_counts),
            "sleepers_material_counts": sleepers_material_counts,
        }

    @staticmethod
    def _write_statistics(path: Path, summary: dict) -> None:
        """
        Write the scalar and count statistics of ``summary`` to ``path``.
        """
        with open(path, "w", encoding="utf-8") as f:
            f.write(f"track types: {summary['track_type']}\n")
            f.write(f"fouled percentage: {summary['is_fouled_percentage']}\n")
            f.write(
                "water infiltrations percentages: "
                f"{summary['water_infiltrations_percentages']}\n"
            )
            f.write(f"sleeper number distribution: {summary['sleeper_counts']}\n")
            f.write(
                f"sleeper material distribution: {summary['sleepers_material_counts']}\n"
            )

    @staticmethod
    def _plot_distributions(plots_dir: Path, summary: dict) -> None:
        """
        Save the histograms of the aggregated quantities into ``plots_dir``.
        """
        figures = []
        try:
            fig, ax = plt.subplots()
            figures.append(fig)
            fig.suptitle("Fouling level")
            ax.hist(summary["fouling_level"], bins=30)
            fig.savefig(plots_dir / "fouling_level.png")

            # layer sizes
            fig, axs = plt.subplots(ncols=4, sharey=True, tight_layout=True)
            figures.append(fig)
            fig.suptitle("layer sizes")
            for ax, name in zip(axs, ("fouling", "ballast", "asphalt", "PSS")):
                ax.hist(summary["layer_sizes"][name], bins=30)
                ax.set_title(name)
            fig.savefig(plots_dir / "layer_sizes.png")

            # general water content
            fig, ax = plt.subplots()
            figures.append(fig)
            fig.suptitle("General water content distribution")
            ax.hist(summary["general_water_content"], bins=30)
            fig.savefig(plots_dir / "general_water_content.png")

            # layers water ranges
            fig, axs = plt.subplots(ncols=3, sharex=True, sharey=True, tight_layout=True)
            figures.append(fig)
            fig.suptitle("water content distributions")
            for ax, name in zip(axs, ("fouling", "PSS", "subsoil")):
                rows = summary["water_contents"][name]
                # a layer without any sample (e.g. no fouled sample at all)
                # keeps an empty, titled panel instead of failing on indexing
                if rows.shape[0] > 0:
                    ax.hist2d(rows[:, 0], rows[:, 1], bins=30)
                ax.set_title(name)
            fig.savefig(plots_dir / "water_content.png")
        finally:
            # close only the figures opened here, even if saving failed, so
            # figures owned by the caller are left untouched
            for fig in figures:
                plt.close(fig)