mirror of
https://git.roussel.pro/telecom-paris/GIN206.git
synced 2026-02-09 02:30:17 +01:00
added documentation
This commit is contained in:
59
analyze.py
59
analyze.py
@@ -2,6 +2,21 @@ import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def error(df, df_original, column_name):
|
||||
"""
|
||||
Calculate the error between the values in a column of a DataFrame and the last value before each timestamp.
|
||||
|
||||
Args:
|
||||
df (pandas.DataFrame): The DataFrame containing the values.
|
||||
df_original (pandas.DataFrame): The original DataFrame containing the timestamps and values.
|
||||
column_name (str): The name of the column to calculate the error for.
|
||||
|
||||
Returns:
|
||||
list: A list of absolute differences between the values in the specified column and the last value before each timestamp.
|
||||
|
||||
Raises:
|
||||
ValueError: If the specified column does not exist in the DataFrame.
|
||||
"""
|
||||
|
||||
diff = []
|
||||
# Check if the column exists in the DataFrame
|
||||
if column_name not in df.columns:
|
||||
@@ -23,6 +38,19 @@ def error(df, df_original, column_name):
|
||||
|
||||
|
||||
def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
|
||||
"""
|
||||
Plots a histogram of the given data series.
|
||||
|
||||
Parameters:
|
||||
- data_series (array-like): The data series to plot the histogram for.
|
||||
- bins (int): The number of bins to use for the histogram. Default is 10.
|
||||
- title (str): The title of the histogram plot. Default is "Distribution of Absolute Differences".
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
plt.figure(figsize=(8, 4)) # Set the figure size for better readability
|
||||
plt.hist(data_series, bins=bins, color='blue', alpha=0.7, edgecolor='black')
|
||||
plt.title(title)
|
||||
@@ -32,15 +60,40 @@ def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differe
|
||||
plt.show()
|
||||
|
||||
def compute_efficiency(df):
    """
    Compute the efficiency of a data frame, i.e. the average time spent per data point.

    The value is the time elapsed between the first and the last row of the
    "time" column, in seconds, divided by the number of rows (not by the
    number of intervals between rows).

    Parameters:
        df (pandas.DataFrame): The input data frame. It must contain a "time"
            column whose first and last entries subtract to an object that
            exposes ``total_seconds()`` (e.g. pandas timestamps).

    Returns:
        float: The elapsed seconds divided by the number of rows.
    """
    # compute the time difference between the first and last point
    time_diff = df["time"].iloc[-1] - df["time"].iloc[0]
    # compute the number of points
    num_points = len(df)
    # compute the efficiency
    efficiency = time_diff.total_seconds() / num_points
    return efficiency
|
||||
|
||||
def hourly_rate_of_change(df):
|
||||
"""
|
||||
Calculate the average absolute rate of change per hour for a given DataFrame.
|
||||
|
||||
Args:
|
||||
df (pandas.DataFrame): The DataFrame containing the data.
|
||||
|
||||
Returns:
|
||||
pandas.Series: A Series containing the average absolute rate of change per hour.
|
||||
|
||||
Raises:
|
||||
ValueError: If the DataFrame does not include 'time' and 'value' columns, or if it is empty.
|
||||
ValueError: If the 'time' column is not of datetime type.
|
||||
|
||||
"""
|
||||
|
||||
# Check if required columns exist
|
||||
if 'time' not in df.columns or 'value' not in df.columns:
|
||||
raise ValueError("DataFrame must include 'time' and 'value' columns.")
|
||||
|
||||
Reference in New Issue
Block a user