added documentation

Quentin Roussel
2024-05-07 00:36:41 +02:00
parent ac4d95bd07
commit 3fd37213e1
4 changed files with 322 additions and 24 deletions

analyze.py

@@ -2,6 +2,21 @@ import pandas as pd
import matplotlib.pyplot as plt

def error(df, df_original, column_name):
    """
    Calculate the error between the values in a column of a DataFrame and the last value before each timestamp.

    Args:
        df (pandas.DataFrame): The DataFrame containing the values.
        df_original (pandas.DataFrame): The original DataFrame containing the timestamps and values.
        column_name (str): The name of the column to calculate the error for.

    Returns:
        list: A list of absolute differences between the values in the specified column and the last value before each timestamp.

    Raises:
        ValueError: If the specified column does not exist in the DataFrame.
    """
    diff = []
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
@@ -23,6 +38,19 @@ def error(df, df_original, column_name):
def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
    """
    Plots a histogram of the given data series.

    Parameters:
    - data_series (array-like): The data series to plot the histogram for.
    - bins (int): The number of bins to use for the histogram. Default is 10.
    - title (str): The title of the histogram plot. Default is "Distribution of Absolute Differences".

    Returns:
    None
    """
    import matplotlib.pyplot as plt
    plt.figure(figsize=(8, 4))  # Set the figure size for better readability
    plt.hist(data_series, bins=bins, color='blue', alpha=0.7, edgecolor='black')
    plt.title(title)
@@ -32,15 +60,40 @@ def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differe
    plt.show()

def compute_efficiency(df):
    """
    Compute the efficiency of a data frame, i.e. the average time in seconds between collected data points.

    Parameters:
    df (pandas.DataFrame): The input data frame.

    Returns:
    float: The efficiency value.
    """
    # compute the time difference between the first and last point
    time_diff = df["time"].iloc[-1] - df["time"].iloc[0]
    # compute the number of points
    num_points = len(df)
    # compute the efficiency
    efficiency = time_diff.total_seconds() / num_points
    return efficiency
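As a quick sanity check of the metric, here is a minimal sketch on a hypothetical synthetic frame (not one of the repository's datasets), calling compute_efficiency as defined above:

import pandas as pd

# Three readings spaced 10 minutes apart: a 1200 s span over 3 points gives 400 s per point.
times = pd.date_range("2024-05-01 00:00", periods=3, freq="10min")
df_demo = pd.DataFrame({"time": times, "value": [20.0, 20.5, 21.0]})
print(compute_efficiency(df_demo))  # 400.0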
def hourly_rate_of_change(df):
    """
    Calculate the average absolute rate of change per hour for a given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data.

    Returns:
        pandas.Series: A Series containing the average absolute rate of change per hour.

    Raises:
        ValueError: If the DataFrame does not include 'time' and 'value' columns, or if it is empty.
        ValueError: If the 'time' column is not of datetime type.
    """
    # Check if required columns exist
    if 'time' not in df.columns or 'value' not in df.columns:
        raise ValueError("DataFrame must include 'time' and 'value' columns.")

generate_data.py

@@ -4,6 +4,19 @@ from opensimplex import OpenSimplex
import datetime

def generate_greenhouse_data(filepath):
    """
    Generate filtered greenhouse data from a CSV file.

    Parameters:
    filepath (str): The path to the CSV file.

    Returns:
    pandas.DataFrame: The filtered greenhouse data.
    """
    # Read the CSV file into a DataFrame, parsing 'time' as datetime
    df = pd.read_csv(filepath, parse_dates=["time"], dtype={"id": str, "value": float})
@@ -25,6 +38,21 @@ def generate_greenhouse_data(filepath):
def generate_simplex(start_time=None, end_time=None, interval=600, max_temp=30, min_temp=10, frequency=10):
    """
    Generate a DataFrame with time and temperature values using Simplex noise.

    Parameters:
    - start_time (datetime): The start time for generating the data. If not provided, it defaults to 1 day before the end_time.
    - end_time (datetime): The end time for generating the data. If not provided, it defaults to the current time.
    - interval (int): The time interval in seconds between each data point. Defaults to 600 seconds (10 minutes).
    - max_temp (float): The maximum temperature value. Defaults to 30.
    - min_temp (float): The minimum temperature value. Defaults to 10.
    - frequency (int): The frequency parameter for the Simplex noise generator. Defaults to 10.

    Returns:
    - df (DataFrame): A pandas DataFrame with 'time' and 'value' columns representing the generated time and temperature values.
    """
    # Default time settings if none provided
    if end_time is None:
        end_time = datetime.datetime.now()
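Based only on the documented signature above (the rest of the function body is outside this hunk), a usage sketch for generating two days of synthetic readings might look like:

import datetime

end = datetime.datetime.now()
start = end - datetime.timedelta(days=2)
df_synth = generate_simplex(start_time=start, end_time=end, interval=600,
                            max_temp=28, min_temp=12, frequency=10)
print(df_synth.head())  # expected columns: time, value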

main.py

@@ -4,11 +4,37 @@ from generate_data import *
from analyze import *
from poll import *

def sort(X, Y):
    """
    Sorts two lists X and Y in ascending order based on the values in X.

    Args:
        X (list): The first list to be sorted.
        Y (list): The second list to be sorted.

    Returns:
        tuple: Two tuples containing the sorted X and Y values (zip yields tuples, not lists).

    Example:
        X = [3, 1, 2]
        Y = ['c', 'a', 'b']
        sorted_X, sorted_Y = sort(X, Y)
        # sorted_X: (1, 2, 3)
        # sorted_Y: ('a', 'b', 'c')
    """
    return zip(*sorted(zip(X, Y)))
def plot_temperature_data(df, recent_count=None):
    """
    Plots the temperature data from a DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame containing the temperature data.
        recent_count (int, optional): The number of recent data points to plot. If specified, only the last 'recent_count' rows will be plotted. Defaults to None.

    Returns:
        None
    """
    plt.figure(figsize=(5, 5))
    # Check if recent_count is specified and valid
@@ -25,6 +51,20 @@ def plot_temperature_data(df, recent_count=None):
    plt.show()

def test_sample_every_kth_point(df):
    """
    Test the sample_every_kth_point function with different values of k.

    Parameters:
    - df: The input DataFrame containing the data.

    Returns:
    - X: The array of values used for sampling.
    - EFFICIENCY: The efficiency values for each sampling.
    - MEAN: The mean error values for each sampling.
    - MEDIAN: The median error values for each sampling.
    - STD: The standard deviation of error values for each sampling.
    """
    X = np.arange(1, 10, 1)
    MEAN = []
    STD = []
@@ -46,24 +86,59 @@ def test_sample_every_kth_point(df):
def example_sample_every_kth_point(k=10):
    """
    Example function that demonstrates how to sample every kth point from a dataframe and plot the temperature data.

    Parameters:
    k (int): The sampling interval. Default is 10.

    Returns:
    None
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(150)
    df = sample_every_kth_point(df, k)
    plot_temperature_data(df)

def example_sample_reglin():
    """
    This function demonstrates the usage of the sample_reglin function.
    It generates greenhouse data, selects the last 150 rows, applies the sample_reglin function,
    and plots the temperature data.
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(150)
    df = sample_reglin(df)
    plot_temperature_data(df)

def example_optimal_sample(dT=0.3):
    """
    Example function that demonstrates the usage of the optimal_sample function.

    Parameters:
    dT (float): The threshold value for temperature difference. Default is 0.3.

    Returns:
    None
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(150)
    df = optimal_sample(df, threshold_dT=dT)
    plot_temperature_data(df)
def example_sample_avg_rate_of_change():
    """
    This function demonstrates how to calculate the sample average rate of change for temperature data.
    It generates greenhouse data, calculates the hourly rate of change, selects the last 150 records,
    and then calculates the sample average rate of change based on the hourly rate of change.
    Finally, it plots the temperature data.

    Parameters:
    None

    Returns:
    None
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    hroc = hourly_rate_of_change(df)
    df = df.tail(150)
@@ -71,6 +146,25 @@ def example_sample_avg_rate_of_change():
    plot_temperature_data(df)

def test_sample_reglin(df):
    """
    Perform a test on the sample_reglin function with different values of max_dT.

    Parameters:
    - df: DataFrame
        The input DataFrame containing temperature data.

    Returns:
    - X: ndarray
        An array of values ranging from 0.4 to 3 with a step of 0.05.
    - EFFICIENCY: list
        A list of efficiency values calculated for each max_dT value.
    - MEAN: list
        A list of mean error values calculated for each max_dT value.
    - MEDIAN: list
        A list of median error values calculated for each max_dT value.
    - STD: list
        A list of standard deviation error values calculated for each max_dT value.
    """
    X = np.arange(0.4, 3, 0.05)
    MEAN = []
    STD = []
@@ -91,6 +185,20 @@ def test_sample_reglin(df):
def test_optimal_sample(df):
    """
    Test the optimal sample function with different threshold values.

    Args:
        df (pandas.DataFrame): The input DataFrame containing temperature data.

    Returns:
        tuple: A tuple containing the following lists:
        - X (numpy.ndarray): An array of threshold values.
        - EFFICIENCY (list): A list of efficiency values for each threshold.
        - MEAN (list): A list of mean error values for each threshold.
        - MEDIAN (list): A list of median error values for each threshold.
        - STD (list): A list of standard deviation error values for each threshold.
    """
    X = np.arange(0.1, 3, 0.05)
    MEAN = []
    STD = []
@@ -109,7 +217,24 @@ def test_optimal_sample(df):
        EFFICIENCY.append(compute_efficiency(df_sampeld))
    return X, EFFICIENCY, MEAN, MEDIAN, STD

def test_sample_avg_rate_of_change(df, hourly_rate_of_change):
    """
    Test the sample average rate of change.
    This function takes a DataFrame `df` and the `hourly_rate_of_change` as input.
    It performs a series of calculations on the data and returns the results.

    Parameters:
    - df (pandas.DataFrame): The input DataFrame containing the data.
    - hourly_rate_of_change (pandas.Series): The average absolute rate of change for each hour of the day.

    Returns:
    - X (numpy.ndarray): An array of values ranging from 0.01 to 3 with a step of 0.05.
    - EFFICIENCY (list): A list of efficiency values calculated for each sample.
    - MEAN (list): A list of mean values calculated for each sample.
    - MEDIAN (list): A list of median values calculated for each sample.
    - STD (list): A list of standard deviation values calculated for each sample.
    """
    X = np.arange(0.01, 3, 0.05)
    MEAN = []
    STD = []
@@ -128,35 +253,56 @@ def test_sample_avg_rate_of_change(df,hourly_rate_of_change):
        EFFICIENCY.append(compute_efficiency(df_sampled))
    return X, EFFICIENCY, MEAN, MEDIAN, STD

def comparaison_mean(df, limit=1000):
    """
    Compare different sampling methods based on their mean error and efficiency.

    Parameters:
    - df: DataFrame
        The input DataFrame containing the data.
    - limit: int, optional
        The number of rows to consider from the end of the DataFrame. Default is 1000.

    Returns:
    None
    """
    plt.figure(figsize=(10, 5))
    hroc = hourly_rate_of_change(df)
    df = df.tail(limit)
    X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_every_kth_point(df)
    MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
    plt.plot(MEAN, EFFICIENCY, label="Constant Polling Interval", marker='x')
    X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_reglin(df)
    MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
    plt.plot(MEAN, EFFICIENCY, label="Linear Regression", marker='x')
    X, EFFICIENCY, MEAN, MEDIAN, STD = test_optimal_sample(df)
    MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
    plt.plot(MEAN, EFFICIENCY, label="Optimal Polling Rate", marker='x')
    X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_avg_rate_of_change(df, hroc)
    MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
    plt.plot(MEAN, EFFICIENCY, label="Hourly Rate of Change", marker='x')
    plt.ylabel("Average seconds between polls")
    plt.xlabel("Average error")
    plt.ylim(0, 8000)
    plt.xlim(0, 1.3)
    plt.legend()
    plt.show()
def example_optimal_sample(dT=0.3):
    """
    This function demonstrates how to use the `optimal_sample` function to generate an optimal sample of greenhouse data.

    Parameters:
    dT (float): The threshold value for temperature difference. Default is 0.3.

    Returns:
    None
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(1000)
    df = optimal_sample(df, threshold_dT=dT)
@@ -169,6 +315,15 @@ def example_optimal_sample(dT = 0.3):
    plt.show()

def histogram_sample_every_kth_point(k=10):
    """
    Generate a histogram of the differences between the original data and the sampled data.

    Parameters:
    - k (int): The sampling interval. Only every kth point will be included in the sampled data.

    Returns:
    None
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(1000)
    df_sampled = sample_every_kth_point(df, k)
@@ -192,9 +347,9 @@ def histogram_sample_every_kth_point(k=10):
# comparaison_mean(df)

# Temperature rate of change over the day
# df = generate_greenhouse_data("datasets/greenhouse.csv")
# hcor = hourly_rate_of_change(df)
# print(hcor)
# hcor.plot()
# plt.xlabel("Hour of the day")
# plt.ylabel("Average absolute rate of change (°C/hour)")

poll.py

@@ -2,6 +2,22 @@ import datetime
from analyze import hourly_rate_of_change

def sample_every_kth_point(df, k):
    """
    Sample every k-th point from a DataFrame.

    Parameters:
    - df: pandas DataFrame
        The DataFrame from which to sample the points.
    - k: int
        The interval between sampled points.

    Returns:
    - sampled_df: pandas DataFrame
        The DataFrame containing the sampled points.

    Raises:
    - ValueError: If k is not a positive integer or if k exceeds the number of rows in the DataFrame.
    """
    # Validate the input to ensure k is positive and does not exceed the DataFrame length
    if k <= 0:
        raise ValueError("k must be a positive integer.")
@@ -13,6 +29,17 @@ def sample_every_kth_point(df, k):
    return sampled_df
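A minimal usage sketch on a hypothetical synthetic frame; the sampling step between the validation and the return is outside this hunk, so only the documented behaviour is assumed:

import pandas as pd

times = pd.date_range("2024-05-01", periods=100, freq="10min")
df = pd.DataFrame({"time": times, "value": [20.0 + 0.01 * i for i in range(100)]})
df_sampled = sample_every_kth_point(df, k=10)   # keep roughly every 10th reading
print(len(df), "->", len(df_sampled))
# sample_every_kth_point(df, 0) raises ValueError, per the validation above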
def optimal_sample(df, threshold_dT=0.5):
    """
    Returns a subset of the input DataFrame `df` containing rows that have a significant change in value.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    threshold_dT (float, optional): The threshold value for the change in value. Defaults to 0.5.

    Returns:
    pandas.DataFrame: A subset of the input DataFrame `df` containing rows with significant changes in value.
    """
    t0 = df["time"].iloc[0]
    indices = [0]
    times = [t0]
@@ -23,22 +50,45 @@ def optimal_sample(df, threshold_dT=0.5):
            indices.append(i)
    return df.iloc[indices]
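A hedged usage sketch on synthetic data; per the docstring, rows are kept only when the value has changed by more than threshold_dT since the last kept row (the selection loop itself is elided from this hunk):

import numpy as np
import pandas as pd

times = pd.date_range("2024-05-01", periods=200, freq="10min")
values = 20 + 5 * np.sin(np.linspace(0, 4 * np.pi, 200))
df = pd.DataFrame({"time": times, "value": values})
df_sampled = optimal_sample(df, threshold_dT=0.5)
print(len(df), "->", len(df_sampled))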
def sample_reglin(df, max_dT=0.5, max_poll_interval=2 * 3600):
    """
    Returns a subset of the input DataFrame `df` by sampling points based on a linear regression algorithm.

    Parameters:
    - df (pandas.DataFrame): The input DataFrame containing the time series data.
    - max_dT (float): The value difference that should be considered significant enough to add a new value.
      Defaults to 0.5.
    - max_poll_interval (int): The maximum time interval allowed between the first and last point in the subset.
      Defaults to 2 hours (2 * 3600 seconds).

    Returns:
    - pandas.DataFrame: A subset of the input DataFrame `df` containing the sampled points.

    Raises:
    - ValueError: If there is no point after the specified date.
    """
    indices = []

    def get_first_point_after(date):
        if df[df['time'] > date].empty:
            raise ValueError("No point after the date")
        return df[df['time'] > date].iloc[0]

    # Get first two points
    t0 = df["time"].iloc[0]
    t1 = df["time"].iloc[1]
    while True:
        v0 = df[df["time"] == t0]["value"].values[0]
        v1 = df[df["time"] == t1]["value"].values[0]
        # Calculate the slope
        s = abs((v1 - v0) / (t1 - t0).total_seconds())
        # Add max_dT/s to t1
        new_t = t1 + datetime.timedelta(seconds=min(max_dT / s, max_poll_interval))
        try:
            new_t = get_first_point_after(new_t)["time"]
            indices.append(df[df["time"] == new_t].index[0])
@@ -46,12 +96,24 @@ def sample_reglin(df,max_dT=0.5, max_poll_interval=2 * 3600):
            t1 = new_t
        except ValueError:
            break
    return df.loc[indices]
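A hedged usage sketch of the slope-based sampler on synthetic sine data with the documented defaults. The value series should not be flat: a zero slope between the first two points would make max_dT/s divide by zero in the loop above.

import numpy as np
import pandas as pd

times = pd.date_range("2024-05-01", periods=200, freq="10min")
values = 20 + 5 * np.sin(np.linspace(0, 4 * np.pi, 200))
df = pd.DataFrame({"time": times, "value": values})
# Poll more often where the local slope is steep, but never wait more than 2 h.
df_sampled = sample_reglin(df, max_dT=0.5, max_poll_interval=2 * 3600)
print(len(df), "->", len(df_sampled))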
def sample_avg_rate_of_change(df, poll_rate):
    """
    Sample a DataFrame using a per-hour polling interval.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data.
        poll_rate (pandas.Series): The Series containing the poll interval in seconds for each hour of the day.

    Returns:
        pandas.DataFrame: The subset of rows kept whenever the time elapsed since the last kept row exceeds that hour's poll interval.
    """
    indices = [0]
    for i in range(len(df)):
        current_hour = df["time"].iloc[i].hour
        if df["time"].iloc[i] - df["time"].iloc[indices[-1]] > datetime.timedelta(seconds=poll_rate.iloc[current_hour]):
            indices.append(i)
    return df.iloc[indices]
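A minimal sketch of calling this sampler with a hypothetical constant per-hour schedule; in main.py the schedule is derived from hourly_rate_of_change, but any 24-entry Series of seconds satisfies the loop above:

import pandas as pd

times = pd.date_range("2024-05-01", periods=288, freq="5min")  # one day at 5-minute resolution
df = pd.DataFrame({"time": times, "value": [20.0] * 288})
poll_rate = pd.Series([1800] * 24)   # hypothetical: poll at most every 30 minutes, for every hour of the day
df_sampled = sample_avg_rate_of_change(df, poll_rate)
print(len(df), "->", len(df_sampled))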