From 628f0439b7411fc6956c49be73e81a4eccbec114 Mon Sep 17 00:00:00 2001
From: Artur Mukhamadiev <muhamadiev1@gmail.com>
Date: Sun, 12 Oct 2025 23:37:07 +0300
Subject: [PATCH] [analysis] mean values for non-unique config runs

---
 .gitignore         |   4 ++
 latencyAnalysis.py |  93 ++++++++++++++++++++--------
 qualityAnalysis.py | 147 ++++++++++++++++++++++++++++++---------------
 3 files changed, 172 insertions(+), 72 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0dbf2f2..431a1c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+#project ignore:
+plots/
+results/
+
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/latencyAnalysis.py b/latencyAnalysis.py
index ef5a7fa..7d1741e 100644
--- a/latencyAnalysis.py
+++ b/latencyAnalysis.py
@@ -2,20 +2,23 @@ import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
 import os
+import re
 import argparse
 import logging
 
 # Configure logging to show informational messages
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
+
 
 def parse_args():
     parser = argparse.ArgumentParser(prog=__file__)
     parser.add_argument('-c', '--compensate', action="store_true")
-    parser.add_argument('--latency-csv', 
-                        type=str, 
+    parser.add_argument('--latency-csv',
+                        type=str,
                         default='sample/latencyDataframenvh264enc.csv',
                         help='Path to the latency results CSV file.')
-    parser.add_argument('-pd','--plot-dir', 
+    parser.add_argument('-pd', '--plot-dir',
                         type=str,
                         default='plots/',
                         help='Path to directory in which resulted plots should be saved')
@@ -25,13 +28,17 @@ def parse_args():
                         help='Path to directory in which resulted csv data should be saved')
     return parser.parse_args()
 
+
 cmd_args = None
+
+
 def get_args():
     global cmd_args
     if cmd_args is None:
         cmd_args = parse_args()
     return cmd_args
 
+
 def plot_latency_data(df):
     def create_labels(df):
         """Combines MultiIndex levels (L0-L3) into a single string for notes."""
@@ -40,7 +47,7 @@ def plot_latency_data(df):
             # Format: L#:value | L#:value | ...
             label_parts = [f"L{j}:{val}" for j, val in enumerate(index)]
             labels[i + 1] = " | ".join(label_parts)
-        return labels        
+        return labels
 
     df = df.head(10)
     encoder_name = df.index.get_level_values(0)[0]
@@ -53,20 +60,26 @@ def plot_latency_data(df):
     r3 = [x + bar_width for x in r2]
     fig = plt.figure(figsize=(10, 6), dpi=300)
     # Create the bars
-    plt.bar(r1, df['max'], color='red', width=bar_width, edgecolor='grey', label='Max Latency')
-    plt.bar(r2, df['avg'], color='blue', width=bar_width, edgecolor='grey', label='Avg Latency')
-    plt.bar(r3, df['median'], color='green', width=bar_width, edgecolor='grey', label='Median Latency')
+    plt.bar(r1, df['max'], color='red', width=bar_width,
+            edgecolor='grey', label='Max Latency')
+    plt.bar(r2, df['avg'], color='blue', width=bar_width,
+            edgecolor='grey', label='Avg Latency')
+    plt.bar(r3, df['median'], color='green', width=bar_width,
+            edgecolor='grey', label='Median Latency')
 
     # Add labels and ticks
     plt.xlabel('Индекс конфигурации', fontweight='bold')
     plt.ylabel('Общая задержка [мс]', fontweight='bold')
-    plt.xticks([r + bar_width for r in range(num_configs)], [str(i + 1) for i in range(num_configs)])
-    plt.title(f'Сравнение производительности {num_configs} лучших конфигураций по задержке для {encoder_name}')
+    plt.xticks([r + bar_width for r in range(num_configs)],
+               [str(i + 1) for i in range(num_configs)])
+    plt.title(
+        f'Сравнение производительности {num_configs} лучших конфигураций по задержке для {encoder_name}')
     plt.legend()
     plt.grid(axis='y', linestyle='--', alpha=0.6)
 
     plt.tight_layout()
-    plt.savefig(get_args().plot_dir + f'combined_top_configurations_plot_{encoder_name}.png')
+    plt.savefig(get_args().plot_dir +
+                f'combined_top_configurations_plot_{encoder_name}.png')
     plt.close()
 
     # Output Notes (for user interpretation)
@@ -74,6 +87,7 @@ def plot_latency_data(df):
     for index, note in max_notes.items():
         print(f"Index {index}: {note}")
 
+
 def plot_start_latency(df):
     fig = plt.figure(figsize=(10, 6), dpi=300)
     r1 = np.arange(len(df))
@@ -86,6 +100,7 @@ def plot_start_latency(df):
     plt.savefig(get_args().plot_dir + f"start_latency_{encoder_name}.png")
     plt.close()
 
+
 def analyze_latency_data(csv_path: str):
     """
     Analyzes latency data to find the top 10 components (rows) contributing most
@@ -96,8 +111,9 @@ def analyze_latency_data(csv_path: str):
     """
     # --- 1. Load Data with Multi-level Headers ---
     try:
-        df = pd.read_csv(csv_path, header=[0,1, 2, 3, 4], index_col=0)
-        logging.info(f"Successfully loaded '{csv_path}' with multi-level headers. Shape: {df.shape}")
+        df = pd.read_csv(csv_path, header=[0, 1, 2, 3, 4], index_col=0)
+        logging.info(
+            f"Successfully loaded '{csv_path}' with multi-level headers. Shape: {df.shape}")
         if df.index.name == 'Unnamed: 0':
             df.index.name = 'component'
     except FileNotFoundError:
@@ -107,22 +123,50 @@ def analyze_latency_data(csv_path: str):
         logging.error(f"An error occurred while reading the CSV file: {e}")
         return
 
-    #calculate summary along the rows
+    # calculate summary along the rows
     sumDf = df.sum()
     if get_args().compensate == True:
-        logging.info("Filesrc latency compensation is ON")
+        logging.info("Filesrc and rawvideoparse latency compensation is ON")
         filesrcData = df.loc["filesrc0"]
+        rawvideoparseData = df.loc["rawvideoparse0"]
         sumDf -= filesrcData
-    print(sumDf.head())
-    # return
+        sumDf -= rawvideoparseData
+    logging.debug(f"\n{sumDf.head()}")
 
-    df_summary = sumDf.unstack(level=-1) # or level='Metric' if names are set
+    # Average across repeated runs of the same configuration: repeated runs
+    # show up as metric labels with a numeric suffix ('avg.1', 'avg.2', ...).
+    def get_base_metric(metric):
+        """Strips suffixes like '.1' or '.2' from the metric name."""
+        return re.sub(r'\.\d+$', '', str(metric))
+
+    metric_level_values = sumDf.index.get_level_values(-1)
+    base_metrics_key = metric_level_values.map(get_base_metric)
+
+    # Every index level except the last one identifies the configuration.
+    config_levels = list(range(sumDf.index.nlevels - 1))
+
+    # Group by the configuration levels plus the suffix-free metric name.
+    grouping_keys = [
+        sumDf.index.get_level_values(i) for i in config_levels
+    ] + [base_metrics_key]
+
+    # Entries sharing the same (config, base metric) collapse into one value:
+    # (avg, avg.1, avg.2) become a single mean. Note that 'max' and 'median'
+    # are averaged across runs as well.
+    averaged_sumDf = sumDf.groupby(grouping_keys).mean()
+    logging.info(f"\n{averaged_sumDf.head(10)}")
+    sumDf = averaged_sumDf
+
+    df_summary = sumDf.unstack(level=-1)  # or level='Metric' if names are set
 
     # 2. Sort the resulting DataFrame by the desired metric column.
-    df_sorted_by_max = df_summary.sort_values(by='max', ascending=True)
-    df_sorted_by_avg = df_summary.sort_values(by='avg', ascending=True)
-    df_sorted_by_median = df_summary.sort_values(by='median', ascending=True)
-    
+    df_sorted_by_max = df_summary.sort_values(
+        by='max', ascending=True)  # type: ignore
+    df_sorted_by_avg = df_summary.sort_values(
+        by='avg', ascending=True)  # type: ignore
+    df_sorted_by_median = df_summary.sort_values(
+        by='median', ascending=True)  # type: ignore
+
     print("SORTED BY MAX")
     print(df_sorted_by_max)
     print("---------------")
@@ -139,7 +183,8 @@ def analyze_latency_data(csv_path: str):
 
     # 2. Find the intersection (common elements) of the three sets of indices
     # max is main index because it is commonly introduces the largest amount of latency to the stream
-    common_indices = max_indices.intersection(avg_indices).intersection(median_indices)
+    common_indices = max_indices.intersection(
+        avg_indices).intersection(median_indices)
 
     # 3. Filter the original summary DataFrame (df_summary) using the common indices
     df_common_top_performers = df_summary.loc[common_indices]
@@ -156,8 +201,8 @@ def analyze_latency_data(csv_path: str):
     top_10_df.to_csv(get_args().csv_dir + f"{encoder_name}.csv")
     return
 
+
 if __name__ == '__main__':
     os.makedirs(get_args().csv_dir, exist_ok=True)
     os.makedirs(get_args().plot_dir, exist_ok=True)
     analyze_latency_data(get_args().latency_csv)
-
diff --git a/qualityAnalysis.py b/qualityAnalysis.py
index e8d8fd1..7d89d35 100644
--- a/qualityAnalysis.py
+++ b/qualityAnalysis.py
@@ -4,19 +4,22 @@ import numpy as np
 import logging
 import argparse
 import os
+import re
 
 # Configure logging to show informational messages
-logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
+
 
 def parse_args():
     parser = argparse.ArgumentParser(prog=__file__)
     parser.add_argument(
-        '--quality-csv', 
-        type=str, 
+        '--quality-csv',
+        type=str,
         default='sample/qualityResultsnvh264enc.csv',
         help='Path to the quality results CSV file.'
     )
-    parser.add_argument('-pd','--plot-dir', 
+    parser.add_argument('-pd', '--plot-dir',
                         type=str,
                         default='plots/',
                         help='Path to directory in which resulted plots should be saved')
@@ -26,21 +29,25 @@ def parse_args():
                         help='Path to directory in which resulted csv data should be saved')
     return parser.parse_args()
 
+
 cmd_args = None
+
+
 def get_args():
     global cmd_args
     if cmd_args is None:
         cmd_args = parse_args()
     return cmd_args
 
+
 def plot_top_configurations(df: pd.DataFrame, file_name: str, title: str):
     """
     Draws a bar plot comparing PSNR and SSIM for the top 10 video configurations.
-    
+
     The plot uses a primary Y-axis for PSNR and a secondary Y-axis for SSIM
     due to their different value ranges. The X-axis uses simple numerical indices,
     with detailed configuration notes printed separately below the plot.
-    
+
     Args:
         df: DataFrame containing the top configurations, must have 'PSNR' and 'SSIM' columns.
         file_name: Name of the file to which plot would be saved.
@@ -48,50 +55,50 @@ def plot_top_configurations(df: pd.DataFrame, file_name: str, title: str):
     """
     # Use the top 10 rows for plotting
     plot_df = df.head(10).copy()
-    
+
     if plot_df.empty:
         logging.warning("DataFrame is empty, cannot generate plot.")
         return
 
     # Create the index for the x-axis (0 to 9 for bar plotting)
     config_indices = np.arange(len(plot_df))
-    
+
     # 1. Create simple numerical labels for the X-axis (1 to 10)
     x_labels_simple = [str(i + 1) for i in config_indices]
-    
+
     # 2. Generate notes mapping index to configuration details (similar to the template)
     quality_notes = {}
     for i, row in plot_df.iterrows():
         # Format: Index: encoder | profile | video | parameters
         note_parts = [
-            row['encoder'], 
-            row['profile'], 
-            row['video'], 
+            row['encoder'],
+            row['profile'],
+            row['video'],
             row['parameters']
         ]
         quality_notes[len(quality_notes) + 1] = " | ".join(note_parts)
-    
+
     # 3. Setup the figure and the primary axis (ax1)
     fig, ax1 = plt.subplots(figsize=(12, 6))
-    
+
     # Define bar width and positions
     bar_width = 0.35
-    
+
     # 4. Plot PSNR on the primary axis (left)
-    bar1 = ax1.bar(config_indices - bar_width/2, plot_df['PSNR'], bar_width, 
+    bar1 = ax1.bar(config_indices - bar_width/2, plot_df['PSNR'], bar_width,
                    label='PSNR (dB)', color='Blue', edgecolor='grey')
-    ax1.set_xlabel('Configuration Index', fontsize=12) # Simplified X-label
+    ax1.set_xlabel('Configuration Index', fontsize=12)  # Simplified X-label
     ax1.set_ylabel('PSNR (dB)', color='Black', fontsize=12)
     ax1.tick_params(axis='y', labelcolor='Black')
     ax1.set_xticks(config_indices)
     # Use simple numerical labels for the X-axis
-    ax1.set_xticklabels(x_labels_simple, fontsize=10) 
-    
+    ax1.set_xticklabels(x_labels_simple, fontsize=10)
+
     # Add PSNR value labels above the bars
     for rect in bar1:
         height = rect.get_height()
         ax1.annotate(f'PSNR={height:.2f}',
-                     xy=(rect.get_x() + rect.get_width() / 2, height / 1.5 ),
+                     xy=(rect.get_x() + rect.get_width() / 2, height / 1.5),
                      xytext=(0, 0),  # 3 points vertical offset
                      textcoords="offset points", transform_rotates_text=True,
                      rotation=90,
@@ -99,18 +106,18 @@ def plot_top_configurations(df: pd.DataFrame, file_name: str, title: str):
 
     # 5. Create a secondary axis (ax2) for SSIM (twinx)
     ax2 = ax1.twinx()
-    
+
     # 6. Plot SSIM on the secondary axis (right)
-    bar2 = ax2.bar(config_indices + bar_width/2, plot_df['SSIM'], bar_width, 
+    bar2 = ax2.bar(config_indices + bar_width/2, plot_df['SSIM'], bar_width,
                    label='SSIM', color='Red', edgecolor='grey')
     ax2.set_ylabel('SSIM (Structural Similarity)', color='Black', fontsize=12)
     ax2.tick_params(axis='y', labelcolor='Black')
-    
+
     # Add SSIM value labels above the bars
     for rect in bar2:
         height = rect.get_height()
         ax2.annotate(f'SSIM={height:.4f}',
-                     xy=(rect.get_x() + rect.get_width() / 2, height / 1.5 ),
+                     xy=(rect.get_x() + rect.get_width() / 2, height / 1.5),
                      xytext=(0, 0),  # 3 points vertical offset
                      textcoords="offset points", transform_rotates_text=True,
                      rotation=90,
@@ -118,12 +125,13 @@ def plot_top_configurations(df: pd.DataFrame, file_name: str, title: str):
 
     # 7. Final Plot appearance
     fig.suptitle(title)
-    fig.tight_layout(rect=[0, 0.03, 1, 0.95]) 
-    
+    fig.tight_layout(rect=(0.0, 0.03, 1.0, 0.95))  # (left, bottom, right, top)
+
     # Combine legends from both axes
     lines1, labels1 = ax1.get_legend_handles_labels()
     lines2, labels2 = ax2.get_legend_handles_labels()
-    ax1.legend(lines1 + lines2, labels1 + labels2, bbox_to_anchor=(0.6, 1.1), ncol=2)
+    ax1.legend(lines1 + lines2, labels1 + labels2,
+               bbox_to_anchor=(0.6, 1.1), ncol=2)
 
     plt.grid(axis='y', linestyle='--', alpha=0.7)
     plt.savefig(f'{file_name}.png')
@@ -133,11 +141,13 @@ def plot_top_configurations(df: pd.DataFrame, file_name: str, title: str):
     for index, note in quality_notes.items():
         print(f"Index {index}: {note}")
 
+
 def analyze_quality_report(csv_path: str):
     # --- 1. Load Data with Multi-level Headers ---
     try:
         df = pd.read_csv(csv_path, header=[0, 1, 2, 3, 4], index_col=0)
-        logging.info(f"Successfully loaded '{csv_path}' with multi-level headers. Shape: {df.shape}")
+        logging.info(
+            f"Successfully loaded '{csv_path}' with multi-level headers. Shape: {df.shape}")
         if df.index.name == 'Unnamed: 0':
             df.index.name = 'component'
     except FileNotFoundError:
@@ -146,16 +156,45 @@ def analyze_quality_report(csv_path: str):
     except Exception as e:
         logging.error(f"An error occurred while reading the CSV file: {e}")
         return
-    
+
     # Get row with average results
     avgDf = df.loc["Average"]
+    logging.info(f"\n{avgDf.head(10)}")
+    # Average across repeated runs of the same configuration: repeated runs
+    # show up as metric labels with a numeric suffix (e.g. 'PSNR.1', 'PSNR.2').
+    # NOTE(review): presumably pandas adds the suffix when it reads duplicated
+    # header columns -- confirm against the CSV producer.
+    def get_base_metric(metric):
+        """Strips suffixes like '.1' or '.2' from the metric name."""
+        return re.sub(r'\.\d+$', '', str(metric))
+
+    metric_level_values = avgDf.index.get_level_values(-1)
+    base_metrics_key = metric_level_values.map(get_base_metric)
+
+    # Every index level except the last one identifies the configuration.
+    config_levels = list(range(avgDf.index.nlevels - 1))
+
+    # Group by the configuration levels plus the suffix-free metric name.
+    grouping_keys = [
+        avgDf.index.get_level_values(i) for i in config_levels
+    ] + [base_metrics_key]
+
+    # Entries sharing the same (config, base metric) collapse into one value:
+    # a metric and its suffixed duplicates become a single mean.
+    averaged_avgDf = avgDf.groupby(grouping_keys).mean()
+    avgDf = averaged_avgDf
+
+    # Log the averaged values once, after the reassignment.
+    logging.info(f"\n{avgDf.head(10)}")
+
     avgDf = avgDf.unstack(level=-1)
-    
     encoder_name = avgDf.index.get_level_values(0)[0]
     logging.debug(f"encoder_name={encoder_name}")
 
-    dfPSNRsorted = avgDf.sort_values(by="PSNR", ascending=False)
-    dfSSIMsorted = avgDf.sort_values(by="SSIM", ascending=False)
+    dfPSNRsorted = avgDf.sort_values(
+        by="PSNR", ascending=False)  # type: ignore
+    dfSSIMsorted = avgDf.sort_values(
+        by="SSIM", ascending=False)  # type: ignore
 
     indexPSNR = dfPSNRsorted.index
     indexSSIM = dfSSIMsorted.index
@@ -168,53 +207,65 @@ def analyze_quality_report(csv_path: str):
     # Convert the MultiIndex (encoder, profile, video, parameters) into columns
     df_quality_results = intersectedDf.reset_index()
     # Rename the columns to match the latency report's structure
-    df_quality_results.columns = ['encoder', 'profile', 'video', 'parameters', 'PSNR', 'SSIM']
-    logging.debug(f"Prepared quality results dataframe columns: {df_quality_results.columns.tolist()}")
-
+    df_quality_results.columns = [
+        'encoder', 'profile', 'video', 'parameters', 'PSNR', 'SSIM']
+    logging.debug(
+        f"Prepared quality results dataframe columns: {df_quality_results.columns.tolist()}")
 
     # Now intersected with latency report
     latency_df = pd.read_csv(f'results/{encoder_name}.csv')
-    columns = {'Unnamed: 0': 'encoder', 'Unnamed: 1': 'profile', 'Unnamed: 2': 'video', 'Unnamed: 3': 'parameters'}
+    columns = {'Unnamed: 0': 'encoder', 'Unnamed: 1': 'profile',
+               'Unnamed: 2': 'video', 'Unnamed: 3': 'parameters'}
     latency_df.rename(columns=columns, inplace=True)
     logging.debug(f"\n{latency_df.head()}")
 
-        # --- 4. Merge Quality and Latency Reports ---
+    # --- 4. Merge Quality and Latency Reports ---
     # Use an inner merge on the four identifier columns to combine the data.
     merge_keys = ['encoder', 'profile', 'video', 'parameters']
     merged_df = pd.merge(
         df_quality_results,
         latency_df,
         on=merge_keys,
-        how='inner' # Only keep records present in both (i.e., the top quality configurations)
+        # Only keep records present in both (i.e., the top quality configurations)
+        how='inner'
     )
 
     logging.info("=" * 70)
     logging.info("--- Intersected Quality (PSNR/SSIM) and Latency Report ---")
-    logging.info(f"Number of common configuration entries found: {len(merged_df)}")
+    logging.info(
+        f"Number of common configuration entries found: {len(merged_df)}")
     logging.info("=" * 70)
 
     # Prepare for display
     merged_df_display = merged_df.sort_values(by='PSNR', ascending=False)
-    
+
     # Select and display key metrics
     display_columns = [
-        'encoder', 'profile', 'video', 'parameters', 
-        'PSNR', 'SSIM', # Quality metrics
-        'avg', 'max', 'median', 'std' # Latency metrics (assuming these are in the latency report)
+        'encoder', 'profile', 'video', 'parameters',
+        'PSNR', 'SSIM',  # Quality metrics
+        # Latency metrics (assuming these are in the latency report)
+        'avg', 'max', 'median', 'std'
     ]
-    
-    final_cols = [col for col in display_columns if col in merged_df_display.columns]
+
+    final_cols = [
+        col for col in display_columns if col in merged_df_display.columns]
 
     print(f"\n{merged_df_display[final_cols].to_string()}")
 
-    plot_top_configurations(merged_df_display, get_args().plot_dir +  f"top_quality_configurations_by_latency_{encoder_name}", f"Результаты качества для 10 лучших конфигураций по задержкам для {encoder_name}")
+    plot_top_configurations(merged_df_display,
+                            get_args().plot_dir +
+                            f"top_quality_configurations_by_latency_{encoder_name}",
+                            f"Результаты качества для 10 лучших конфигураций по задержкам для {encoder_name}")
 
-    plot_top_configurations(df_quality_results, get_args().plot_dir + f"top_quality_configurations_{encoder_name}", f"10 лучших конфигураций по PSNR и SSIM для {encoder_name}")
+    plot_top_configurations(df_quality_results,
+                            get_args().plot_dir +
+                            f"top_quality_configurations_{encoder_name}",
+                            f"10 лучших конфигураций по PSNR и SSIM для {encoder_name}")
 
     return
 
+
 if __name__ == '__main__':
     os.makedirs(get_args().csv_dir, exist_ok=True)
     os.makedirs(get_args().plot_dir, exist_ok=True)
     analyze_quality_report(get_args().quality_csv)
-