mirror of
https://git.roussel.pro/telecom-paris/GIN206.git
synced 2026-02-09 10:40:17 +01:00
ajout fonctions de base pour parse des données et les analyser
This commit is contained in:
BIN
__pycache__/analyze.cpython-310.pyc
Normal file
BIN
__pycache__/analyze.cpython-310.pyc
Normal file
Binary file not shown.
BIN
__pycache__/generate_data.cpython-310.pyc
Normal file
BIN
__pycache__/generate_data.cpython-310.pyc
Normal file
Binary file not shown.
BIN
__pycache__/poll.cpython-310.pyc
Normal file
BIN
__pycache__/poll.cpython-310.pyc
Normal file
Binary file not shown.
24
analyze.py
Normal file
24
analyze.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
def distribution_of_differences(df, column_name):
    """Return the absolute step sizes between consecutive rows of one column.

    Args:
        df: DataFrame that holds the column to analyse.
        column_name: Label of the column whose row-to-row changes are measured.

    Returns:
        A Series with the absolute difference between each row and the row
        before it. NaN entries are removed, so the first row (which has no
        predecessor) never appears in the result.

    Raises:
        ValueError: If ``column_name`` is not one of ``df.columns``.
    """
    if column_name in df.columns:
        # diff() produces NaN wherever no previous value is available;
        # dropna() discards those entries so only real step sizes remain.
        return df[column_name].diff().abs().dropna()

    raise ValueError(f"The column '{column_name}' does not exist in the DataFrame.")
|
||||||
|
|
||||||
|
def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
    """Draw a histogram of ``data_series`` in a new figure and display it.

    Args:
        data_series: Values to bin; passed straight to ``plt.hist``.
        bins: Bin specification forwarded to ``plt.hist``.
        title: Text shown above the plot.
    """
    # Bar appearance is kept in one place: translucent blue with black outlines.
    bar_style = {'color': 'blue', 'alpha': 0.7, 'edgecolor': 'black'}

    # A wide, short canvas keeps the bars readable.
    plt.figure(figsize=(8, 4))
    plt.hist(data_series, bins=bins, **bar_style)

    # Axis labelling and decoration.
    plt.xlabel('Absolute Difference')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.grid(True)

    plt.show()
|
||||||
167517
datasets/greenhouse.csv
Normal file
167517
datasets/greenhouse.csv
Normal file
File diff suppressed because it is too large
Load Diff
12
env/.gitignore
vendored
Normal file
12
env/.gitignore
vendored
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Virtualenv
|
||||||
|
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
|
||||||
|
.Python
|
||||||
|
[Bb]in
|
||||||
|
[Ii]nclude
|
||||||
|
[Ll]ib
|
||||||
|
[Ll]ib64
|
||||||
|
[Ll]ocal
|
||||||
|
[Ss]cripts
|
||||||
|
pyvenv.cfg
|
||||||
|
.venv
|
||||||
|
pip-selfcheck.json
|
||||||
52
generate_data.py
Normal file
52
generate_data.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from opensimplex import OpenSimplex
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
def generate_greenhouse_data(filepath, min_value=0, max_value=50, max_diff=6):
    """Load temperature readings from a CSV file and drop implausible rows.

    A row is kept only when its ``value`` lies strictly between ``min_value``
    and ``max_value`` and its absolute change from the previous row of the
    file is at most ``max_diff``.

    Args:
        filepath: Path or buffer accepted by ``pandas.read_csv``. The data
            must contain a ``time`` column (parsed as datetime) and a
            ``value`` column (read as float); ``id`` is requested as str.
        min_value: Exclusive lower bound for ``value``. Defaults to 0, the
            bound the code has always applied (an earlier comment wrongly
            stated -10).
        max_value: Exclusive upper bound for ``value``.
        max_diff: Largest accepted absolute jump from the previous row.

    Returns:
        A new DataFrame with the columns of the CSV and only the rows that
        passed the filter. The original row index labels are preserved.
    """
    df = pd.read_csv(filepath, parse_dates=["time"], dtype={"id": str, "value": float})

    # Jump size relative to the previous raw row. The first row has no
    # predecessor, so its jump is treated as 0 and never rejects it.
    # Kept as a standalone Series (not a temporary 'diff' column) so that a
    # genuine 'diff' column in the input is neither overwritten nor dropped.
    jump = df['value'].diff().abs().fillna(0)

    # NOTE: jumps are measured on the unfiltered data, so the reading that
    # immediately follows a spike is rejected together with the spike itself.
    in_range = (df['value'] > min_value) & (df['value'] < max_value)
    steady = jump <= max_diff

    return df[in_range & steady]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_simplex(start_time=None, end_time=None, interval=600, max_temp=30, min_temp=10, frequency=10, seed=None):
    """Generate a synthetic temperature series from OpenSimplex noise.

    Args:
        start_time: First timestamp. Defaults to one day before ``end_time``.
        end_time: Upper limit for timestamps. Defaults to the current time,
            using the timezone of ``start_time`` when one was supplied.
        interval: Seconds between consecutive samples; must be positive.
        max_temp: Value the noise output 1 is mapped to.
        min_temp: Value the noise output -1 is mapped to.
        frequency: Divisor applied to the sample index before it is fed to
            the noise function; larger values give a smoother curve. Must be
            non-zero.
        seed: Seed for the noise generator. When None, a random seed in
            [0, 1000) is drawn so each call yields a different series.

    Returns:
        DataFrame with a ``time`` column (datetimes spaced ``interval``
        seconds apart, starting at ``start_time``) and a ``value`` column
        (scaled noise). Empty when ``end_time`` precedes ``start_time``.

    Raises:
        ValueError: If ``interval`` is not positive or ``frequency`` is zero.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (zero)
    # or a silently empty result (negative interval).
    if interval <= 0:
        raise ValueError("interval must be a positive number of seconds.")
    if frequency == 0:
        raise ValueError("frequency must be non-zero.")

    if end_time is None:
        # Match the awareness of start_time so the subtraction below cannot
        # mix naive and timezone-aware datetimes.
        tz = start_time.tzinfo if start_time is not None else None
        end_time = datetime.datetime.now(tz=tz)
    if start_time is None:
        start_time = end_time - datetime.timedelta(days=1)

    # Number of whole intervals that fit in the window; int() keeps range()
    # usable when interval is given as a float.
    total_seconds = int((end_time - start_time).total_seconds())
    steps = int(total_seconds // interval)
    sample_count = max(steps + 1, 0)

    times = [start_time + datetime.timedelta(seconds=i * interval) for i in range(sample_count)]

    if seed is None:
        seed = np.random.randint(0, 1000)
    simplex = OpenSimplex(seed=seed)

    # Walk along the x axis of the 2D noise field with y fixed at 0.
    temperatures = [simplex.noise2(x=i / frequency, y=0) for i in range(sample_count)]

    # Linearly map the noise output (nominally in [-1, 1]) onto
    # [min_temp, max_temp].
    scaled_temperatures = min_temp + (np.array(temperatures) + 1) / 2 * (max_temp - min_temp)

    return pd.DataFrame({'time': times, 'value': scaled_temperatures})
|
||||||
36
main.py
Normal file
36
main.py
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from generate_data import *
|
||||||
|
from analyze import *
|
||||||
|
from poll import *
|
||||||
|
|
||||||
|
def plot_temperature_data(df, recent_count=None):
    """Plot the ``value`` column against the ``time`` column and display it.

    Args:
        df: DataFrame with ``time`` and ``value`` columns.
        recent_count: When given and greater than zero, only the last
            ``recent_count`` rows are plotted; otherwise every row is used.
    """
    plt.figure(figsize=(10, 5))

    # Narrow the data to its tail only when a positive count was requested.
    wants_tail = recent_count is not None and recent_count > 0
    shown = df.tail(recent_count) if wants_tail else df

    plt.plot(shown['time'], shown['value'], label='Temperature', color='tab:red')

    # Labels and decoration.
    plt.xlabel('Time')
    plt.ylabel('Temperature (°C)')
    plt.title('Temperature Over Time')
    plt.grid(True)
    plt.legend()

    # Slanted tick labels stay legible; tight_layout then refits the axes
    # inside the figure area.
    plt.xticks(rotation=45)
    plt.tight_layout()

    plt.show()
|
||||||
|
|
||||||
|
# Read and filter the greenhouse readings, then show the whole series.
df = generate_greenhouse_data("datasets/greenhouse.csv")
plot_temperature_data(df)

# Thinned copy of the data: every 50th row only.
df2 = sample_every_kth_point(df, 50)

# Row-to-row step sizes for both series; steps above 10 are left out of the
# histograms.
diff1 = distribution_of_differences(df, 'value').loc[lambda steps: steps <= 10]
diff2 = distribution_of_differences(df2, 'value').loc[lambda steps: steps <= 10]

# One histogram per series, shown one after the other.
plot_histogram(diff1, bins=20, title='Distribution of Absolute Differences (Original Data)')
plot_histogram(diff2, bins=20, title='Distribution of Absolute Differences (Sampled Data)')
|
||||||
10
poll.py
Normal file
10
poll.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
def sample_every_kth_point(df, k):
    """Return every ``k``-th row of ``df``, starting with the first row.

    Args:
        df: DataFrame to thin out.
        k: Step between retained rows; must be positive and no larger than
            the number of rows in ``df``.

    Returns:
        The rows at positions 0, k, 2k, ... with their original index labels.

    Raises:
        ValueError: If ``k`` is zero or negative, or exceeds ``len(df)``.
    """
    row_count = len(df)

    # The positivity check comes first so its message wins when both fail.
    if not k > 0:
        raise ValueError("k must be a positive integer.")
    if row_count < k:
        raise ValueError("k is greater than the number of rows in the DataFrame.")

    every_kth = slice(None, None, k)
    return df.iloc[every_kth]
|
||||||
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
pandas
|
||||||
|
noise
|
||||||
Reference in New Issue
Block a user