Ajout de fonctions de base pour parser des données et les analyser

This commit is contained in:
Quentin Roussel
2024-05-01 23:30:31 +02:00
parent 239aa49d7f
commit e6cfd468a6
10 changed files with 167653 additions and 0 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

24
analyze.py Normal file
View File

@@ -0,0 +1,24 @@
import pandas as pd
import matplotlib.pyplot as plt
def distribution_of_differences(df, column_name):
    """Return the absolute row-to-row changes of one DataFrame column.

    Args:
        df: DataFrame containing the column to analyse.
        column_name: Label of the column whose consecutive differences
            are measured.

    Returns:
        A Series of absolute differences between each row and the one
        before it. The first row has no predecessor, so its NaN entry is
        removed, together with any other NaN difference.

    Raises:
        ValueError: If ``column_name`` is not one of ``df``'s columns.
    """
    if column_name in df.columns:
        step_sizes = df[column_name].diff()
        # dropna() discards the leading NaN produced by diff().
        return step_sizes.abs().dropna()
    raise ValueError(f"The column '{column_name}' does not exist in the DataFrame.")
def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
    """Draw and show a histogram of ``data_series`` in a new figure.

    Args:
        data_series: Values to bin; forwarded unchanged to ``plt.hist``.
        bins: Bin specification forwarded to ``plt.hist``.
        title: Text used as the plot title.
    """
    bar_style = {'color': 'blue', 'alpha': 0.7, 'edgecolor': 'black'}

    plt.figure(figsize=(8, 4))
    plt.hist(data_series, bins=bins, **bar_style)

    # Labelling and grid.
    plt.title(title)
    plt.xlabel('Absolute Difference')
    plt.ylabel('Frequency')
    plt.grid(True)

    plt.show()

167517
datasets/greenhouse.csv Normal file

File diff suppressed because it is too large Load Diff

12
env/.gitignore vendored Normal file
View File

@@ -0,0 +1,12 @@
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.json

52
generate_data.py Normal file
View File

@@ -0,0 +1,52 @@
import pandas as pd
import numpy as np
from opensimplex import OpenSimplex
import datetime
def generate_greenhouse_data(filepath, min_value=0, max_value=50, max_diff=6):
    """Load temperature readings from a CSV file and drop implausible rows.

    The file is read with its ``time`` column parsed as datetimes, ``id``
    as strings and ``value`` as floats. A row is kept only when its value
    lies strictly between ``min_value`` and ``max_value`` and its absolute
    difference from the previous row of the raw file is at most
    ``max_diff``.

    Args:
        filepath: CSV location, passed straight to ``pandas.read_csv``.
        min_value: Exclusive lower bound on ``value`` (default 0).
        max_value: Exclusive upper bound on ``value`` (default 50).
        max_diff: Largest accepted absolute jump from the previous raw
            reading (default 6).

    Returns:
        The filtered DataFrame. It keeps the columns and index labels of
        the rows that passed; no helper column is added.
    """
    df = pd.read_csv(filepath, parse_dates=["time"], dtype={"id": str, "value": float})

    # Jump relative to the previous raw reading. The first row has no
    # predecessor, so its NaN is replaced by 0 and it is never rejected on
    # this criterion. Kept as a separate Series so the frame is not modified.
    jump = df['value'].diff().abs().fillna(0)

    # Jumps are measured on the unfiltered data, so the row that follows an
    # outlier is normally rejected as well.
    in_range = (df['value'] > min_value) & (df['value'] < max_value)
    steady = jump <= max_diff
    return df[in_range & steady]
def generate_simplex(start_time=None, end_time=None, interval=600, max_temp=30, min_temp=10, frequency=10, seed=None):
    """Build a synthetic temperature series from OpenSimplex noise.

    Args:
        start_time: First timestamp of the series. Defaults to one day
            before ``end_time``.
        end_time: Upper limit for the timestamps. Defaults to the current
            local time (``datetime.datetime.now()``).
        interval: Spacing between samples, in seconds.
        max_temp: Value a noise sample of +1 is mapped to.
        min_temp: Value a noise sample of -1 is mapped to.
        frequency: Divisor applied to the sample index before it is fed to
            the noise function; larger values give a slower-varying curve.
        seed: Seed for the noise generator. When ``None`` (the default) a
            random seed in ``[0, 1000)`` is drawn, so each call gives a
            different series; pass an integer to get reproducible output.

    Returns:
        A DataFrame with a ``time`` column of timestamps and a ``value``
        column of scaled noise values.
    """
    if end_time is None:
        end_time = datetime.datetime.now()
    if start_time is None:
        start_time = end_time - datetime.timedelta(days=1)

    # Number of whole intervals that fit in the window; both endpoints are
    # sampled, hence the "+ 1".
    total_seconds = int((end_time - start_time).total_seconds())
    sample_count = total_seconds // interval + 1

    times = [start_time + datetime.timedelta(seconds=i * interval) for i in range(sample_count)]

    if seed is None:
        seed = np.random.randint(0, 1000)
    simplex = OpenSimplex(seed=seed)

    # Walk along the x axis of the 2-D noise field with y fixed at 0.
    raw_noise = [simplex.noise2(x=i / frequency, y=0) for i in range(sample_count)]

    # Noise output is expected in [-1, 1]; rescale it to [min_temp, max_temp].
    scaled_temperatures = min_temp + (np.array(raw_noise) + 1) / 2 * (max_temp - min_temp)

    return pd.DataFrame({'time': times, 'value': scaled_temperatures})

36
main.py Normal file
View File

@@ -0,0 +1,36 @@
import pandas as pd
import matplotlib.pyplot as plt
from generate_data import *
from analyze import *
from poll import *
def plot_temperature_data(df, recent_count=None):
    """Show a line plot of the ``value`` column against the ``time`` column.

    Args:
        df: DataFrame providing the ``time`` and ``value`` columns.
        recent_count: When given and positive, only the last
            ``recent_count`` rows are plotted; otherwise every row is used.
    """
    plt.figure(figsize=(10, 5))

    tail_only = recent_count is not None and recent_count > 0
    shown = df.tail(recent_count) if tail_only else df

    plt.plot(shown['time'], shown['value'], label='Temperature', color='tab:red')

    plt.title('Temperature Over Time')
    plt.xlabel('Time')
    plt.ylabel('Temperature (°C)')
    plt.grid(True)
    plt.legend()

    # Slanted tick labels plus tight_layout keep the timestamps legible
    # and inside the figure area.
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
# Read the filtered greenhouse readings and show the full time series.
df = generate_greenhouse_data("datasets/greenhouse.csv")
plot_temperature_data(df)

# Keep every 50th reading for comparison with the full data.
df2 = sample_every_kth_point(df, 50)

# Absolute step sizes of each dataset, limited to steps of at most 10.
diff1 = distribution_of_differences(df, 'value')
diff1 = diff1[diff1 <= 10]
diff2 = distribution_of_differences(df2, 'value')
diff2 = diff2[diff2 <= 10]

# One histogram per dataset, original first.
plot_histogram(diff1, bins=20, title='Distribution of Absolute Differences (Original Data)')
plot_histogram(diff2, bins=20, title='Distribution of Absolute Differences (Sampled Data)')

10
poll.py Normal file
View File

@@ -0,0 +1,10 @@
def sample_every_kth_point(df, k):
    """Return every k-th row of ``df``, beginning with the first row.

    Args:
        df: DataFrame to thin out.
        k: Step between retained rows.

    Returns:
        The rows at positions 0, k, 2k, ... selected with ``df.iloc``.

    Raises:
        ValueError: If ``k`` is not positive, or if ``k`` is larger than
            the number of rows in ``df``.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer.")
    if len(df) < k:
        raise ValueError("k is greater than the number of rows in the DataFrame.")

    stride = slice(None, None, k)
    return df.iloc[stride]

2
requirements.txt Normal file
View File

@@ -0,0 +1,2 @@
pandas
noise