mirror of
https://git.roussel.pro/telecom-paris/GIN206.git
synced 2026-02-09 02:30:17 +01:00
added documentation
This commit is contained in:
59
analyze.py
59
analyze.py
@@ -2,6 +2,21 @@ import pandas as pd
|
|||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
def error(df, df_original, column_name):
|
def error(df, df_original, column_name):
|
||||||
|
"""
|
||||||
|
Calculate the error between the values in a column of a DataFrame and the last value before each timestamp.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
df (pandas.DataFrame): The DataFrame containing the values.
|
||||||
|
df_original (pandas.DataFrame): The original DataFrame containing the timestamps and values.
|
||||||
|
column_name (str): The name of the column to calculate the error for.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: A list of absolute differences between the values in the specified column and the last value before each timestamp.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the specified column does not exist in the DataFrame.
|
||||||
|
"""
|
||||||
|
|
||||||
diff = []
|
diff = []
|
||||||
# Check if the column exists in the DataFrame
|
# Check if the column exists in the DataFrame
|
||||||
if column_name not in df.columns:
|
if column_name not in df.columns:
|
||||||
@@ -23,6 +38,19 @@ def error(df, df_original, column_name):
|
|||||||
|
|
||||||
|
|
||||||
def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
|
def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
|
||||||
|
"""
|
||||||
|
Plots a histogram of the given data series.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- data_series (array-like): The data series to plot the histogram for.
|
||||||
|
- bins (int): The number of bins to use for the histogram. Default is 10.
|
||||||
|
- title (str): The title of the histogram plot. Default is "Distribution of Absolute Differences".
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
plt.figure(figsize=(8, 4)) # Set the figure size for better readability
|
plt.figure(figsize=(8, 4)) # Set the figure size for better readability
|
||||||
plt.hist(data_series, bins=bins, color='blue', alpha=0.7, edgecolor='black')
|
plt.hist(data_series, bins=bins, color='blue', alpha=0.7, edgecolor='black')
|
||||||
plt.title(title)
|
plt.title(title)
|
||||||
@@ -32,15 +60,40 @@ def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differe
|
|||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def compute_efficiency(df):
|
def compute_efficiency(df):
|
||||||
#compute the time differnece between the first and last point
|
"""
|
||||||
|
Compute the efficiency of a data frame, i.e., the average time in seconds per collected data point (time span between the first and last row divided by the number of rows).
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
df (pandas.DataFrame): The input data frame.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
float: The efficiency value, in seconds per data point.
|
||||||
|
|
||||||
|
"""
|
||||||
|
# compute the time difference between the first and last point
|
||||||
time_diff = df["time"].iloc[-1] - df["time"].iloc[0]
|
time_diff = df["time"].iloc[-1] - df["time"].iloc[0]
|
||||||
#compute the number of points
|
# compute the number of points
|
||||||
num_points = len(df)
|
num_points = len(df)
|
||||||
#compute the efficiency
|
# compute the efficiency
|
||||||
efficiency = time_diff.total_seconds() / num_points
|
efficiency = time_diff.total_seconds() / num_points
|
||||||
return efficiency
|
return efficiency
|
||||||
|
|
||||||
def hourly_rate_of_change(df):
|
def hourly_rate_of_change(df):
|
||||||
|
"""
|
||||||
|
Calculate the average absolute rate of change per hour for a given DataFrame.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
df (pandas.DataFrame): The DataFrame containing the data.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
pandas.Series: A Series containing the average absolute rate of change per hour.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the DataFrame does not include 'time' and 'value' columns, or if it is empty.
|
||||||
|
ValueError: If the 'time' column is not of datetime type.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
# Check if required columns exist
|
# Check if required columns exist
|
||||||
if 'time' not in df.columns or 'value' not in df.columns:
|
if 'time' not in df.columns or 'value' not in df.columns:
|
||||||
raise ValueError("DataFrame must include 'time' and 'value' columns.")
|
raise ValueError("DataFrame must include 'time' and 'value' columns.")
|
||||||
|
|||||||
@@ -4,6 +4,19 @@ from opensimplex import OpenSimplex
|
|||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
def generate_greenhouse_data(filepath):
|
def generate_greenhouse_data(filepath):
|
||||||
|
"""
|
||||||
|
Generate filtered greenhouse data from a CSV file.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
filepath (str): The path to the CSV file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
pandas.DataFrame: The filtered greenhouse data.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Rest of the code...
|
||||||
|
def generate_greenhouse_data(filepath):
|
||||||
|
|
||||||
# Read the CSV file into a DataFrame, parsing 'time' as datetime
|
# Read the CSV file into a DataFrame, parsing 'time' as datetime
|
||||||
df = pd.read_csv(filepath, parse_dates=["time"], dtype={"id": str, "value": float})
|
df = pd.read_csv(filepath, parse_dates=["time"], dtype={"id": str, "value": float})
|
||||||
|
|
||||||
@@ -25,6 +38,21 @@ def generate_greenhouse_data(filepath):
|
|||||||
|
|
||||||
|
|
||||||
def generate_simplex(start_time=None, end_time=None, interval=600, max_temp=30, min_temp=10, frequency=10):
|
def generate_simplex(start_time=None, end_time=None, interval=600, max_temp=30, min_temp=10, frequency=10):
|
||||||
|
"""
|
||||||
|
Generate a DataFrame with time and temperature values using Simplex noise.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- start_time (datetime): The start time for generating the data. If not provided, it defaults to 1 day before the end_time.
|
||||||
|
- end_time (datetime): The end time for generating the data. If not provided, it defaults to the current time.
|
||||||
|
- interval (int): The time interval in seconds between each data point. Defaults to 600 seconds (10 minutes).
|
||||||
|
- max_temp (float): The maximum temperature value. Defaults to 30.
|
||||||
|
- min_temp (float): The minimum temperature value. Defaults to 10.
|
||||||
|
- frequency (int): The frequency parameter for the Simplex noise generator. Defaults to 10.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- df (DataFrame): A pandas DataFrame with 'time' and 'value' columns representing the generated time and temperature values.
|
||||||
|
"""
|
||||||
|
|
||||||
# Default time settings if none provided
|
# Default time settings if none provided
|
||||||
if end_time is None:
|
if end_time is None:
|
||||||
end_time = datetime.datetime.now()
|
end_time = datetime.datetime.now()
|
||||||
|
|||||||
185
main.py
185
main.py
@@ -4,11 +4,37 @@ from generate_data import *
|
|||||||
from analyze import *
|
from analyze import *
|
||||||
from poll import *
|
from poll import *
|
||||||
|
|
||||||
# sort two lists based on the first list
|
def sort(X, Y):
|
||||||
def sort(X,Y):
|
"""
|
||||||
return zip(*sorted(zip(X,Y)))
|
Sorts two lists X and Y in ascending order based on the values in X.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
X (list): The first list to be sorted.
|
||||||
|
Y (list): The second list to be sorted.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
zip: An iterator yielding two tuples: the X values in ascending order, then the Y values reordered to match.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
X = [3, 1, 2]
|
||||||
|
Y = ['c', 'a', 'b']
|
||||||
|
sorted_X, sorted_Y = sort(X, Y)
|
||||||
|
# sorted_X: (1, 2, 3)
|
||||||
|
# sorted_Y: ('a', 'b', 'c')
|
||||||
|
"""
|
||||||
|
return zip(*sorted(zip(X, Y)))
|
||||||
|
|
||||||
def plot_temperature_data(df, recent_count=None):
|
def plot_temperature_data(df, recent_count=None):
|
||||||
|
"""
|
||||||
|
Plots the temperature data from a DataFrame.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
df (pandas.DataFrame): The DataFrame containing the temperature data.
|
||||||
|
recent_count (int, optional): The number of recent data points to plot. If specified, only the last 'recent_count' rows will be plotted. Defaults to None.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
plt.figure(figsize=(5, 5))
|
plt.figure(figsize=(5, 5))
|
||||||
|
|
||||||
# Check if recent_count is specified and valid
|
# Check if recent_count is specified and valid
|
||||||
@@ -25,6 +51,20 @@ def plot_temperature_data(df, recent_count=None):
|
|||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def test_sample_every_kth_point(df):
|
def test_sample_every_kth_point(df):
|
||||||
|
"""
|
||||||
|
Test the sample_every_kth_point function with different values of k.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- df: The input DataFrame containing the data.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- X: The array of values used for sampling.
|
||||||
|
- EFFICIENCY: The efficiency values for each sampling.
|
||||||
|
- MEAN: The mean error values for each sampling.
|
||||||
|
- MEDIAN: The median error values for each sampling.
|
||||||
|
- STD: The standard deviation of error values for each sampling.
|
||||||
|
"""
|
||||||
|
|
||||||
X = np.arange(1, 10, 1)
|
X = np.arange(1, 10, 1)
|
||||||
MEAN = []
|
MEAN = []
|
||||||
STD = []
|
STD = []
|
||||||
@@ -46,24 +86,59 @@ def test_sample_every_kth_point(df):
|
|||||||
|
|
||||||
|
|
||||||
def example_sample_every_kth_point(k=10):
|
def example_sample_every_kth_point(k=10):
|
||||||
|
"""
|
||||||
|
Example function that demonstrates how to sample every kth point from a dataframe and plot the temperature data.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
k (int): The sampling interval. Default is 10.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
df = df.tail(150)
|
df = df.tail(150)
|
||||||
df = sample_every_kth_point(df, k)
|
df = sample_every_kth_point(df, k)
|
||||||
plot_temperature_data(df)
|
plot_temperature_data(df)
|
||||||
|
|
||||||
def example_sample_reglin():
|
def example_sample_reglin():
|
||||||
|
"""
|
||||||
|
This function demonstrates the usage of the sample_reglin function.
|
||||||
|
It generates greenhouse data, selects the last 150 rows, applies the sample_reglin function,
|
||||||
|
and plots the temperature data.
|
||||||
|
"""
|
||||||
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
df = df.tail(150)
|
df = df.tail(150)
|
||||||
df = sample_reglin(df)
|
df = sample_reglin(df)
|
||||||
plot_temperature_data(df)
|
plot_temperature_data(df)
|
||||||
|
|
||||||
def exaample_optimal_sample(dT = 0.3):
|
def example_optimal_sample(dT = 0.3):
|
||||||
|
"""
|
||||||
|
Example function that demonstrates the usage of the optimal_sample function.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
dT (float): The threshold value for temperature difference. Default is 0.3.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
df = df.tail(150)
|
df = df.tail(150)
|
||||||
df = optimal_sample(df, threshold_dT=dT)
|
df = optimal_sample(df, threshold_dT=dT)
|
||||||
plot_temperature_data(df)
|
plot_temperature_data(df)
|
||||||
|
|
||||||
def example_sample_avg_rate_of_change():
|
def example_sample_avg_rate_of_change():
|
||||||
|
"""
|
||||||
|
This function demonstrates how to calculate the sample average rate of change for temperature data.
|
||||||
|
It generates greenhouse data, calculates the hourly rate of change, selects the last 150 records,
|
||||||
|
and then calculates the sample average rate of change based on the hourly rate of change.
|
||||||
|
Finally, it plots the temperature data.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
None
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
hroc = hourly_rate_of_change(df)
|
hroc = hourly_rate_of_change(df)
|
||||||
df = df.tail(150)
|
df = df.tail(150)
|
||||||
@@ -71,6 +146,25 @@ def example_sample_avg_rate_of_change():
|
|||||||
plot_temperature_data(df)
|
plot_temperature_data(df)
|
||||||
|
|
||||||
def test_sample_reglin(df):
|
def test_sample_reglin(df):
|
||||||
|
"""
|
||||||
|
Perform a test on the sample_reglin function with different values of max_dT.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- df: DataFrame
|
||||||
|
The input DataFrame containing temperature data.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- X: ndarray
|
||||||
|
An array of values ranging from 0.4 to 3 with a step of 0.05.
|
||||||
|
- EFFICIENCY: list
|
||||||
|
A list of efficiency values calculated for each max_dT value.
|
||||||
|
- MEAN: list
|
||||||
|
A list of mean error values calculated for each max_dT value.
|
||||||
|
- MEDIAN: list
|
||||||
|
A list of median error values calculated for each max_dT value.
|
||||||
|
- STD: list
|
||||||
|
A list of standard deviation error values calculated for each max_dT value.
|
||||||
|
"""
|
||||||
X = np.arange(0.4, 3, 0.05)
|
X = np.arange(0.4, 3, 0.05)
|
||||||
MEAN = []
|
MEAN = []
|
||||||
STD = []
|
STD = []
|
||||||
@@ -91,6 +185,20 @@ def test_sample_reglin(df):
|
|||||||
|
|
||||||
|
|
||||||
def test_optimal_sample(df):
|
def test_optimal_sample(df):
|
||||||
|
"""
|
||||||
|
Test the optimal sample function with different threshold values.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
df (pandas.DataFrame): The input DataFrame containing temperature data.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple: A tuple containing the following lists:
|
||||||
|
- X (numpy.ndarray): An array of threshold values.
|
||||||
|
- EFFICIENCY (list): A list of efficiency values for each threshold.
|
||||||
|
- MEAN (list): A list of mean error values for each threshold.
|
||||||
|
- MEDIAN (list): A list of median error values for each threshold.
|
||||||
|
- STD (list): A list of standard deviation error values for each threshold.
|
||||||
|
"""
|
||||||
X = np.arange(0.1, 3, 0.05)
|
X = np.arange(0.1, 3, 0.05)
|
||||||
MEAN = []
|
MEAN = []
|
||||||
STD = []
|
STD = []
|
||||||
@@ -109,7 +217,24 @@ def test_optimal_sample(df):
|
|||||||
EFFICIENCY.append(compute_efficiency(df_sampeld))
|
EFFICIENCY.append(compute_efficiency(df_sampeld))
|
||||||
return X, EFFICIENCY, MEAN, MEDIAN, STD
|
return X, EFFICIENCY, MEAN, MEDIAN, STD
|
||||||
|
|
||||||
def test_sample_avg_rate_of_change(df,hourly_rate_of_change):
|
def test_sample_avg_rate_of_change(df, hourly_rate_of_change):
|
||||||
|
"""
|
||||||
|
Test the sample average rate of change.
|
||||||
|
|
||||||
|
This function takes a DataFrame `df` and the `hourly_rate_of_change` as input.
|
||||||
|
It performs a series of calculations on the data and returns the results.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- df (pandas.DataFrame): The input DataFrame containing the data.
|
||||||
|
- hourly_rate_of_change (pandas.Series): The average absolute rate of change per hour, as returned by hourly_rate_of_change(df).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- X (numpy.ndarray): An array of values ranging from 0.01 to 3 with a step of 0.05.
|
||||||
|
- EFFICIENCY (list): A list of efficiency values calculated for each sample.
|
||||||
|
- MEAN (list): A list of mean values calculated for each sample.
|
||||||
|
- MEDIAN (list): A list of median values calculated for each sample.
|
||||||
|
- STD (list): A list of standard deviation values calculated for each sample.
|
||||||
|
"""
|
||||||
X = np.arange(0.01, 3, 0.05)
|
X = np.arange(0.01, 3, 0.05)
|
||||||
MEAN = []
|
MEAN = []
|
||||||
STD = []
|
STD = []
|
||||||
@@ -128,35 +253,56 @@ def test_sample_avg_rate_of_change(df,hourly_rate_of_change):
|
|||||||
EFFICIENCY.append(compute_efficiency(df_sampled))
|
EFFICIENCY.append(compute_efficiency(df_sampled))
|
||||||
return X, EFFICIENCY, MEAN, MEDIAN, STD
|
return X, EFFICIENCY, MEAN, MEDIAN, STD
|
||||||
|
|
||||||
def comparaison_mean(df,limit=1000):
|
def comparaison_mean(df, limit=1000):
|
||||||
|
"""
|
||||||
|
Compare different sampling methods based on their mean and efficiency.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- df: DataFrame
|
||||||
|
The input DataFrame containing the data.
|
||||||
|
- limit: int, optional
|
||||||
|
The number of rows to consider from the end of the DataFrame. Default is 1000.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
plt.figure(figsize=(10, 5))
|
plt.figure(figsize=(10, 5))
|
||||||
hroc = hourly_rate_of_change(df)
|
hroc = hourly_rate_of_change(df)
|
||||||
df = df.tail(limit)
|
df = df.tail(limit)
|
||||||
X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_every_kth_point(df)
|
X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_every_kth_point(df)
|
||||||
MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
|
MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
|
||||||
plt.plot( MEAN,EFFICIENCY, label="Constant Polling Interval", marker='x')
|
plt.plot(MEAN, EFFICIENCY, label="Constant Polling Interval", marker='x')
|
||||||
|
|
||||||
X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_reglin(df)
|
X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_reglin(df)
|
||||||
MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
|
MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
|
||||||
plt.plot( MEAN,EFFICIENCY, label="Linear Regression", marker='x')
|
plt.plot(MEAN, EFFICIENCY, label="Linear Regression", marker='x')
|
||||||
|
|
||||||
X, EFFICIENCY, MEAN, MEDIAN, STD = test_optimal_sample(df)
|
X, EFFICIENCY, MEAN, MEDIAN, STD = test_optimal_sample(df)
|
||||||
MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
|
MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
|
||||||
plt.plot( MEAN,EFFICIENCY, label="Optimal Polling rate", marker='x')
|
plt.plot(MEAN, EFFICIENCY, label="Optimal Polling rate", marker='x')
|
||||||
|
|
||||||
X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_avg_rate_of_change(df,hroc)
|
X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_avg_rate_of_change(df, hroc)
|
||||||
MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
|
MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
|
||||||
plt.plot( MEAN,EFFICIENCY, label="Hourly Rate of Change", marker='x')
|
plt.plot(MEAN, EFFICIENCY, label="Hourly Rate of Change", marker='x')
|
||||||
|
|
||||||
plt.ylabel("Average seconds between polls")
|
plt.ylabel("Average seconds between polls")
|
||||||
plt.xlabel("Average error")
|
plt.xlabel("Average error")
|
||||||
plt.ylim(0, 8000)
|
plt.ylim(0, 8000)
|
||||||
plt.xlim(0,1.3)
|
plt.xlim(0, 1.3)
|
||||||
|
|
||||||
plt.legend()
|
plt.legend()
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def example_optimal_sample(dT = 0.3):
|
def example_optimal_sample(dT = 0.3):
|
||||||
|
"""
|
||||||
|
This function demonstrates how to use the `optimal_sample` function to generate an optimal sample of greenhouse data.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
dT (float): The threshold value for temperature difference. Default is 0.3.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
df = df.tail(1000)
|
df = df.tail(1000)
|
||||||
df = optimal_sample(df, threshold_dT=dT)
|
df = optimal_sample(df, threshold_dT=dT)
|
||||||
@@ -169,6 +315,15 @@ def example_optimal_sample(dT = 0.3):
|
|||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def histogram_sample_every_kth_point(k=10):
|
def histogram_sample_every_kth_point(k=10):
|
||||||
|
"""
|
||||||
|
Generate a histogram of the differences between the original data and the sampled data.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- k (int): The sampling interval. Only every kth point will be included in the sampled data.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
df = df.tail(1000)
|
df = df.tail(1000)
|
||||||
df_sampled = sample_every_kth_point(df, k)
|
df_sampled = sample_every_kth_point(df, k)
|
||||||
@@ -192,9 +347,9 @@ def histogram_sample_every_kth_point(k=10):
|
|||||||
# comparaison_mean(df)
|
# comparaison_mean(df)
|
||||||
|
|
||||||
# Temperature rate of change over the day
|
# Temperature rate of change over the day
|
||||||
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
# df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
hcor = hourly_rate_of_change(df)
|
# hcor = hourly_rate_of_change(df)
|
||||||
print(hcor)
|
# print(hcor)
|
||||||
# hcor.plot()
|
# hcor.plot()
|
||||||
# plt.xlabel("Hour of the day")
|
# plt.xlabel("Hour of the day")
|
||||||
# plt.ylabel("Average absolute rate of change (°C/hour)")
|
# plt.ylabel("Average absolute rate of change (°C/hour)")
|
||||||
|
|||||||
74
poll.py
74
poll.py
@@ -2,6 +2,22 @@ import datetime
|
|||||||
from analyze import hourly_rate_of_change
|
from analyze import hourly_rate_of_change
|
||||||
|
|
||||||
def sample_every_kth_point(df, k):
|
def sample_every_kth_point(df, k):
|
||||||
|
"""
|
||||||
|
Sample every k-th point from a DataFrame.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- df: pandas DataFrame
|
||||||
|
The DataFrame from which to sample the points.
|
||||||
|
- k: int
|
||||||
|
The interval between sampled points.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- sampled_df: pandas DataFrame
|
||||||
|
The DataFrame containing the sampled points.
|
||||||
|
Raises:
|
||||||
|
- ValueError: If k is not a positive integer or if k exceeds the number of rows in the DataFrame.
|
||||||
|
"""
|
||||||
|
|
||||||
# Validate the input to ensure k is positive and does not exceed the DataFrame length
|
# Validate the input to ensure k is positive and does not exceed the DataFrame length
|
||||||
if k <= 0:
|
if k <= 0:
|
||||||
raise ValueError("k must be a positive integer.")
|
raise ValueError("k must be a positive integer.")
|
||||||
@@ -13,6 +29,17 @@ def sample_every_kth_point(df, k):
|
|||||||
return sampled_df
|
return sampled_df
|
||||||
|
|
||||||
def optimal_sample(df, threshold_dT=0.5):
|
def optimal_sample(df, threshold_dT=0.5):
|
||||||
|
"""
|
||||||
|
Returns a subset of the input DataFrame `df` containing rows that have a significant change in value.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
df (pandas.DataFrame): The input DataFrame.
|
||||||
|
threshold_dT (float, optional): The threshold value for the change in value. Defaults to 0.5.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
pandas.DataFrame: A subset of the input DataFrame `df` containing rows with significant changes in value.
|
||||||
|
"""
|
||||||
|
|
||||||
t0 = df["time"].iloc[0]
|
t0 = df["time"].iloc[0]
|
||||||
indices = [0]
|
indices = [0]
|
||||||
times = [t0]
|
times = [t0]
|
||||||
@@ -23,22 +50,45 @@ def optimal_sample(df, threshold_dT=0.5):
|
|||||||
indices.append(i)
|
indices.append(i)
|
||||||
return df.iloc[indices]
|
return df.iloc[indices]
|
||||||
|
|
||||||
def sample_reglin(df,max_dT=0.5, max_poll_interval=2 * 3600):
|
def sample_reglin(df, max_dT=0.5, max_poll_interval=2 * 3600):
|
||||||
|
"""
|
||||||
|
Returns a subset of the input DataFrame `df` by sampling points based on a linear regression algorithm.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- df (pandas.DataFrame): The input DataFrame containing the time series data.
|
||||||
|
- max_dT (float): The value difference that should be considered significant enough to add a new value.
|
||||||
|
Defaults to 0.5.
|
||||||
|
- max_poll_interval (int): The maximum time in seconds to wait after the latest point before polling again; caps the interval max_dT / slope.
|
||||||
|
Defaults to 2 hours (2 * 3600 seconds).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- pandas.DataFrame: A subset of the input DataFrame `df` containing the sampled points.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
- ValueError: Raised internally by get_first_point_after when no point lies after the given date; the sampling loop catches it and stops.
|
||||||
|
|
||||||
|
"""
|
||||||
indices = []
|
indices = []
|
||||||
|
|
||||||
def get_first_point_after(date):
|
def get_first_point_after(date):
|
||||||
if(df[df['time'] > date].empty):
|
if df[df['time'] > date].empty:
|
||||||
raise ValueError("No point before the date")
|
raise ValueError("No point before the date")
|
||||||
return df[df['time'] > date].iloc[0]
|
return df[df['time'] > date].iloc[0]
|
||||||
|
|
||||||
# Get first two points
|
# Get first two points
|
||||||
t0 = df["time"].iloc[0]
|
t0 = df["time"].iloc[0]
|
||||||
t1 = df["time"].iloc[1]
|
t1 = df["time"].iloc[1]
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
v0 = df[df["time"] == t0]["value"].values[0]
|
v0 = df[df["time"] == t0]["value"].values[0]
|
||||||
v1 = df[df["time"] == t1]["value"].values[0]
|
v1 = df[df["time"] == t1]["value"].values[0]
|
||||||
|
|
||||||
# Calculate the slope
|
# Calculate the slope
|
||||||
s = abs((v1 - v0) / (t1 - t0).total_seconds())
|
s = abs((v1 - v0) / (t1 - t0).total_seconds())
|
||||||
#add max_dT/s to t1
|
|
||||||
new_t = t1 + datetime.timedelta(seconds=min(max_dT/s, max_poll_interval))
|
# Add max_dT/s to t1
|
||||||
|
new_t = t1 + datetime.timedelta(seconds=min(max_dT / s, max_poll_interval))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
new_t = get_first_point_after(new_t)["time"]
|
new_t = get_first_point_after(new_t)["time"]
|
||||||
indices.append(df[df["time"] == new_t].index[0])
|
indices.append(df[df["time"] == new_t].index[0])
|
||||||
@@ -46,12 +96,24 @@ def sample_reglin(df,max_dT=0.5, max_poll_interval=2 * 3600):
|
|||||||
t1 = new_t
|
t1 = new_t
|
||||||
except ValueError:
|
except ValueError:
|
||||||
break
|
break
|
||||||
|
|
||||||
return df.loc[indices]
|
return df.loc[indices]
|
||||||
|
|
||||||
def sample_avg_rate_of_change(df,poll_rate):
|
def sample_avg_rate_of_change(df, poll_rate):
|
||||||
|
"""
|
||||||
|
Sample a DataFrame using a polling interval that depends on the hour of the day.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
df (pandas.DataFrame): The DataFrame containing the data.
|
||||||
|
poll_rate (pandas.Series): The polling interval in seconds for each hour of the day, looked up by position (hour 0-23).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
pandas.DataFrame: The first row plus every row whose time since the previously kept row exceeds the polling interval for that row's hour.
|
||||||
|
|
||||||
|
"""
|
||||||
indices = [0]
|
indices = [0]
|
||||||
for i in range(len(df)):
|
for i in range(len(df)):
|
||||||
current_hour = df["time"].iloc[i].hour
|
current_hour = df["time"].iloc[i].hour
|
||||||
if(df["time"].iloc[i] - df["time"].iloc[indices[-1]] > datetime.timedelta(seconds = poll_rate.iloc[current_hour])):
|
if df["time"].iloc[i] - df["time"].iloc[indices[-1]] > datetime.timedelta(seconds=poll_rate.iloc[current_hour]):
|
||||||
indices.append(i)
|
indices.append(i)
|
||||||
return df.iloc[indices]
|
return df.iloc[indices]
|
||||||
|
|||||||
Reference in New Issue
Block a user