LSTM (out-of-sample test)¶
In [1]:
import pandas as pd
import numpy as np
import os, sys
from pathlib import Path
import torch
from datetime import datetime
import logging
import mlflow
import warnings

# Project layout: the repo root exposes `modules/`; the experiment folder holds
# study-specific assets. Both must be on sys.path before the local imports below.
root_folder_path = Path('/home/ytli/research')
experiment_folder_path = Path('/home/ytli/research/lstm')
sys.path.extend([str(root_folder_path), str(experiment_folder_path)])

from modules.study_multivar import RealDataTimeSeriesAnalysis, calculate_mse_by_subject_feature
from modules.plot import plot_subject_feature_html_interactive
In [2]:
# Experiment configuration
study_name = "study5-realdata"
folder_path = "fisher"
method_name = "lstm"
real_data_path = experiment_folder_path / study_name / folder_path
datafile_path = "data/fisher_all.csv"

# Initialize the analysis helper for this study / dataset
analysis = RealDataTimeSeriesAnalysis(study_name, folder_path, datafile_path)

# Nine affect items used as model features. Full 21-item set kept for reference:
# energetic, enthusiastic, content, irritable, restless, worried, guilty, afraid,
# anhedonia, angry, hopeless, down, positive, fatigue, tension, concentrate,
# ruminate, avoid_act, reassure, procrast, avoid_people
feature_cols = [
    'down', 'positive', 'content', 'enthusiastic', 'energetic',
    'hopeless', 'angry', 'irritable', 'reassure',
]

df = analysis.load_data()
subject_ids = df['subject_id'].unique()
In [3]:
def _read_subject_list(path):
    """Return the subject ids stored one per line in `path`, whitespace stripped."""
    with open(path, 'r') as f:
        # iterate the file object directly — readlines() materializes the list twice
        return [line.strip() for line in f]

# Fixed train/test subject split, precomputed and stored alongside the data
random_train_subject_list = _read_subject_list(real_data_path / 'data' / 'random_train_subject_list.txt')
random_test_subject_list = _read_subject_list(real_data_path / 'data' / 'random_test_subject_list.txt')
print("random_train_subject_list", random_train_subject_list)
print("random_test_subject_list", random_test_subject_list)
random_train_subject_list ['p007', 'p040', 'p072', 'p033', 'p137', 'p203', 'p100', 'p010', 'p204', 'p215', 'p001', 'p008', 'p019', 'p169', 'p021', 'p014', 'p074', 'p139', 'p111', 'p048'] random_test_subject_list ['p004', 'p075', 'p117', 'p025', 'p113', 'p145', 'p163', 'p006', 'p217', 'p127', 'p012', 'p160', 'p115', 'p037', 'p013', 'p023', 'p202', 'p009', 'p068', 'p003']
In [4]:
# Build per-subject sliding windows, then split them by the precomputed subject lists
sliding_windows_dict = analysis.create_sliding_windows(df, feature_cols, window_size=5, stride=1)
train_sliding_windows_dict = {sid: sliding_windows_dict[sid] for sid in random_train_subject_list}
test_sliding_windows_dict = {sid: sliding_windows_dict[sid] for sid in random_test_subject_list}

train_loader, val_loader, _ = analysis.prepare_datasets(train_sliding_windows_dict)
# NOTE(review): the Lightning warning in the run output suggests this loader has
# shuffling enabled — confirm prepare_datasets and disable shuffle for test data.
test_loader, _, _ = analysis.prepare_datasets(test_sliding_windows_dict)

# Train model
print("\033[1m\033[95mTraining model...\033[0m")
best_model, best_model_checkpoint_metrics, test_results = analysis.train_model(train_loader, val_loader, test_loader)
print(f"\033[1m\033[96mBEST MODEL VAL LOSS:\033[0m {best_model_checkpoint_metrics['val_loss']:.5f}")
print(f"\033[1m\033[96mTEST RESULTS:\033[0m loss = {test_results[0]['test_loss']:.5f}")

# Evaluate and visualize
print("\033[1m\033[95mEvaluating model on test data...\033[0m")
test_eval_data = analysis.evaluate_model(best_model, test_loader)
print("="*80)
2025/05/22 10:47:01 WARNING mlflow.utils.autologging_utils: MLflow pytorch autologging is known to be compatible with 1.9.0 <= torch <= 2.6.0, but the installed version is 2.6.0+cu124. If you encounter errors during autologging, try upgrading / downgrading torch to a compatible version, or try upgrading MLflow. INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Training model...
2025/05/22 10:47:10 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Downloading artifacts: 0%| | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 0%| | 0/1 [00:00<?, ?it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] /home/ytli/miniconda3/envs/research/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:476: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
🏃 View run fisher at: http://localhost:8093/#/experiments/92/runs/c5ec109970594df3a7e975e5d78a2619 🧪 View experiment at: http://localhost:8093/#/experiments/92 BEST MODEL VAL LOSS: 0.72193 TEST RESULTS: loss = 1.73910 Evaluating model on test data... ================================================================================
In [5]:
# Interactive per-subject / per-feature HTML visualization of the test-set evaluation
plot_subject_feature_html_interactive(test_eval_data, feature_cols)
In [6]:
# Per-subject, per-feature MSE on the held-out (out-of-sample) subjects
mse_df = calculate_mse_by_subject_feature(test_eval_data, feature_cols)
# plain string literal — the original was an f-string with no placeholders
mse_df.to_csv('mse-generalize.csv', index=False)
mse_df.head(20)
Out[6]:
| subject_id | feature_name | mse | |
|---|---|---|---|
| 0 | p004 | down | 1.064111 |
| 1 | p004 | positive | 1.522837 |
| 2 | p004 | content | 1.389245 |
| 3 | p004 | enthusiastic | 1.459119 |
| 4 | p004 | energetic | 1.225313 |
| 5 | p004 | hopeless | 1.329111 |
| 6 | p004 | angry | 0.936798 |
| 7 | p004 | irritable | 1.166241 |
| 8 | p004 | reassure | 1.024458 |
| 9 | p075 | down | 0.941535 |
| 10 | p075 | positive | 1.033084 |
| 11 | p075 | content | 1.044805 |
| 12 | p075 | enthusiastic | 0.889902 |
| 13 | p075 | energetic | 1.104642 |
| 14 | p075 | hopeless | 0.988972 |
| 15 | p075 | angry | 0.815556 |
| 16 | p075 | irritable | 1.263785 |
| 17 | p075 | reassure | 0.749723 |
| 18 | p117 | down | 0.959132 |
| 19 | p117 | positive | 1.213930 |
In [ ]: