VAR¶
In [1]:
import pandas as pd
import os, sys
from pathlib import Path
from datetime import datetime
import logging
import mlflow
import warnings
import numpy as np
from statsmodels.tsa.api import VAR
from pathlib import Path
# Set up paths and logging
root_folder_path = Path('/home/ytli/research')
experiment_folder_path = Path('/home/ytli/research/lstm')
sys.path.append(str(root_folder_path))
sys.path.append(str(experiment_folder_path))
from modules.plot import plot_subject_feature_html_interactive
from modules.study_multivar import calculate_mse_by_subject_feature
In [2]:
# Set up paths
study_name = "study5-realdata"
folder_path = "fisher"
method_name = "lstm"
real_data_path = experiment_folder_path / study_name / folder_path
datafile_path = "data/fisher_all.csv"
# Load and prepare data
feature_cols = [
'down', 'positive', 'content', 'enthusiastic', 'energetic',
'hopeless', 'angry', 'irritable', 'reassure'
]
df = pd.read_csv(real_data_path / datafile_path)
df = df[['subject_id', 'time'] + feature_cols]
In [3]:
subject_data = {}
for subject_id in df['subject_id'].unique():
subject_df = df[df['subject_id'] == subject_id].copy()
subject_df = subject_df.sort_values('time')
subject_df = subject_df.reset_index(drop=True)
n_total = len(subject_df)
n_test = 10
n_val = 10
n_train = n_total - n_test - n_val
train_data = subject_df.iloc[:n_train]
val_data = subject_df.iloc[n_train:n_train+n_val]
test_data = subject_df.iloc[n_train+n_val:]
subject_data[subject_id] = {
'train': train_data,
'val': val_data,
'test': test_data
}
In [4]:
maxlags = 14
subject_models = {}
for subject_id, data in subject_data.items():
try:
model = VAR(data['train'][feature_cols])
results = model.fit() # maxlags=maxlags)
subject_models[subject_id] = results
except Exception as e:
print(f"Error fitting model for subject {subject_id}: {e}")
In [5]:
def generate_one_step_forecasts(model, data_series, feature_cols, nlags, test_points):
'''
Generates one-step-ahead forecasts for the given VAR model
Parameters:
- model: Fitted VAR model
- data_series: Full data series including both train and test
- feature_cols: List of feature column names
- nlags: Maximum lag used in the model
- test_points: Number of test points to forecast
Returns:
- Dictionary with predictions and actual values
'''
# Get the full data as numpy array
full_data = data_series[feature_cols].values
# Split train and test
train_end = len(full_data) - test_points
predictions = []
actual_values = []
# For each test point
for i in range(test_points):
target_pos = train_end + i # This is the position in the full data where we want to forecast
actual = full_data[target_pos]
actual_values.append(actual)
# Generate one-step forecast
lag_data = full_data[target_pos - nlags:target_pos]
# # Debug information
# print(f"Lag data shape for step {i}: {lag_data.shape}")
# print(f"Expected shape: ({nlags}, {len(feature_cols)})")
# Make sure we have the right shape for lag data
if lag_data.shape[0] != nlags:
print(f"Warning: Not enough lag data at step {i}")
raise ValueError(f"Expected lag data shape ({nlags}, {len(feature_cols)}), but got {lag_data.shape}")
forecast = model.forecast(lag_data, steps=1)[0]
predictions.append(forecast)
return {
'pred': predictions,
'target': actual_values
}
In [6]:
# Generate one-step-ahead forecasts for each subject
test_eval_data = {}
for subject_id, model in subject_models.items():
print(f"Generating forecasts for subject {subject_id}...")
try:
data = subject_data[subject_id]
full_data = pd.concat([data['train'], data['val'], data['test']])
test_data = data['test']
test_points = len(test_data)
# print(f"Model info: lag order={model.k_ar}, neqs={model.neqs}")
nlags = model.k_ar
if len(model.coefs) == 0:
print(f"Error: No coefficients in model for subject {subject_id}. Model may not be fitted properly.")
continue
results = generate_one_step_forecasts(model, full_data, feature_cols, nlags, test_points)
# Only save if we have predictions
if len(results['pred']) > 0:
test_eval_data[subject_id] = results
else:
print(f"No valid predictions generated for subject {subject_id}")
except Exception as e:
print(f"Error processing subject {subject_id}: {str(e)}")
Generating forecasts for subject p111... Generating forecasts for subject p025... Generating forecasts for subject p023... Generating forecasts for subject p217... Generating forecasts for subject p137... Generating forecasts for subject p202... Generating forecasts for subject p215... Generating forecasts for subject p163... Generating forecasts for subject p014... Generating forecasts for subject p008... Generating forecasts for subject p160... Generating forecasts for subject p117... Generating forecasts for subject p203... Generating forecasts for subject p019... Generating forecasts for subject p075... Generating forecasts for subject p072... Generating forecasts for subject p040... Generating forecasts for subject p074... Generating forecasts for subject p127... Generating forecasts for subject p007... Generating forecasts for subject p004... Generating forecasts for subject p013... Generating forecasts for subject p145... Generating forecasts for subject p012... Generating forecasts for subject p139... Generating forecasts for subject p001... Generating forecasts for subject p033... Generating forecasts for subject p010... Generating forecasts for subject p006... Generating forecasts for subject p009... Generating forecasts for subject p037... Generating forecasts for subject p169... Generating forecasts for subject p113... Generating forecasts for subject p048... Generating forecasts for subject p021... Generating forecasts for subject p003... Generating forecasts for subject p068... Generating forecasts for subject p100... Generating forecasts for subject p115... Generating forecasts for subject p204...
In [7]:
# Create the interactive plot
fig = plot_subject_feature_html_interactive(test_eval_data, feature_cols)
# Display the plot in the notebook
fig.show()
# Export to HTML file
output_file = "var_interactive_plot.html"
fig.write_html(output_file)
print(f"Plot exported to {output_file}")
Plot exported to var_interactive_plot.html
In [8]:
mse_df = calculate_mse_by_subject_feature(test_eval_data, feature_cols)
mse_df.to_csv("mse.csv", index=False)
In [ ]: