import datetime

from analyze import hourly_rate_of_change


def sample_every_kth_point(df, k):
    """
    Sample every k-th point from a DataFrame.

    Parameters:
    - df: pandas DataFrame
        The DataFrame from which to sample the points.
    - k: int
        The interval between sampled points.

    Returns:
    - sampled_df: pandas DataFrame
        The rows at positions 0, k, 2k, ... of `df`.

    Raises:
    - ValueError: If k is not positive or if k exceeds the number of rows in
      the DataFrame (this also rejects every k when the DataFrame is empty).
    """
    # Validate the input to ensure k is positive and does not exceed the DataFrame length
    if k <= 0:
        raise ValueError("k must be a positive integer.")
    if k > len(df):
        raise ValueError("k is greater than the number of rows in the DataFrame.")

    # Sample every k-th point
    return df.iloc[::k]


def optimal_sample(df, threshold_dT=0.5):
    """
    Returns a subset of the input DataFrame `df` containing rows that have a
    significant change in value.

    The first row is always kept. Every following row is kept when the
    absolute difference between its "value" and the "value" of the most
    recently kept row is strictly greater than `threshold_dT`.

    Parameters:
        df (pandas.DataFrame): The input DataFrame; must have a "value" column.
        threshold_dT (float, optional): The threshold value for the change in
            value. Defaults to 0.5.

    Returns:
        pandas.DataFrame: A subset of the input DataFrame `df` containing rows
        with significant changes in value. Empty when `df` is empty.
    """
    if len(df) == 0:
        # Nothing to sample; there is no first row to seed the selection with.
        return df.iloc[:0]

    # Pull the column out once: per-row `.iloc` scalar access is far slower.
    values = df["value"].to_numpy()

    indices = [0]
    last_kept = values[0]
    for i in range(1, len(values)):
        if abs(values[i] - last_kept) > threshold_dT:
            indices.append(i)
            last_kept = values[i]

    return df.iloc[indices]


def sample_reglin(df, max_dT=0.5, max_poll_interval=2 * 3600):
    """
    Returns a subset of the input DataFrame `df` by sampling points based on a
    linear extrapolation of the two most recent samples.

    Starting from the first two rows, the slope between the two latest samples
    is used to estimate how long the value needs to change by `max_dT`. The
    next sample is the first row whose "time" is strictly after the latest
    sample time plus that estimate (capped at `max_poll_interval`). This is
    repeated until no later row exists.

    NOTE: the first two rows only seed the slope estimate and are not part of
    the returned subset.

    Parameters:
    - df (pandas.DataFrame): The input DataFrame containing the time series
      data, with a datetime-like "time" column and a numeric "value" column.
    - max_dT (float): The value difference that should be considered
      significant enough to add a new value. Defaults to 0.5.
    - max_poll_interval (int): The maximum number of seconds to wait after the
      latest sample before looking for the next one. Defaults to 2 hours
      (2 * 3600 seconds).

    Returns:
    - pandas.DataFrame: A subset of the input DataFrame `df` containing the
      sampled points, selected by row position. Empty when `df` has fewer than
      two rows or when no row lies after the first projected time.
    """
    if len(df) < 2:
        # Two points are needed to estimate a slope.
        return df.iloc[:0]

    time_col = df["time"]
    values = df["value"].to_numpy()

    def first_position_after(moment):
        """Return the position of the first row with time > moment, or None."""
        later = (time_col > moment).to_numpy()
        if not later.any():
            return None
        # argmax on a boolean array yields the position of the first True.
        return int(later.argmax())

    positions = []
    prev_pos, curr_pos = 0, 1
    while True:
        t0 = time_col.iloc[prev_pos]
        t1 = time_col.iloc[curr_pos]
        v0 = values[prev_pos]
        v1 = values[curr_pos]

        elapsed = (t1 - t0).total_seconds()
        if elapsed == 0:
            # Two readings at the same instant: no usable slope. Wait the
            # maximum when the value is unchanged, otherwise look at the very
            # next point.
            wait_seconds = max_poll_interval if v1 == v0 else 0
        else:
            slope = abs((v1 - v0) / elapsed)
            if slope == 0:
                # Flat signal: the value is not expected to change, so wait
                # the maximum interval instead of dividing by zero.
                wait_seconds = max_poll_interval
            else:
                wait_seconds = min(max_dT / slope, max_poll_interval)

        projected = t1 + datetime.timedelta(seconds=wait_seconds)
        next_pos = first_position_after(projected)
        if next_pos is None:
            break

        positions.append(next_pos)
        prev_pos, curr_pos = curr_pos, next_pos

    return df.iloc[positions]


def sample_avg_rate_of_change(df, poll_rate):
    """
    Sample a DataFrame using a polling interval that depends on the hour.

    The first row is always kept. Every following row is kept when the time
    elapsed since the most recently kept row is strictly greater than the
    polling interval configured for that row's hour.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data, with a
            datetime-like "time" column.
        poll_rate (pandas.Series): Polling interval in seconds, looked up by
            position using the hour (0-23) of each row's "time".

    Returns:
        pandas.DataFrame: The subset of `df` made of the kept rows. Empty when
        `df` is empty.
    """
    if len(df) == 0:
        return df.iloc[:0]

    times = df["time"]
    indices = [0]
    last_kept_time = times.iloc[0]

    # Row 0 is already kept, so start comparing from row 1.
    for i, current_time in enumerate(times.iloc[1:], start=1):
        # float() is required: datetime.timedelta rejects numpy integer
        # scalars, which is what an integer-typed Series yields.
        interval = datetime.timedelta(
            seconds=float(poll_rate.iloc[current_time.hour])
        )
        if current_time - last_kept_time > interval:
            indices.append(i)
            last_kept_time = current_time

    return df.iloc[indices]