LSTM (out-of-sample test)¶

In [1]:
import pandas as pd
import numpy as np
import os, sys
from pathlib import Path
import torch
from datetime import datetime
import logging
import mlflow
import warnings

# Set up paths and logging.
# Defaults match the author's machine, but both roots can be overridden via
# environment variables so the notebook is portable; behavior is unchanged
# when the variables are unset.
root_folder_path = Path(os.environ.get('RESEARCH_ROOT', '/home/ytli/research'))
experiment_folder_path = Path(os.environ.get('LSTM_EXPERIMENT_ROOT', str(root_folder_path / 'lstm')))
sys.path.append(str(root_folder_path))
sys.path.append(str(experiment_folder_path))

from modules.study_multivar import RealDataTimeSeriesAnalysis, calculate_mse_by_subject_feature
from modules.plot import plot_subject_feature_html_interactive
In [2]:
# Experiment identifiers and data locations.
study_name = "study5-realdata"
folder_path = "fisher"
method_name = "lstm"
real_data_path = experiment_folder_path / study_name / folder_path
datafile_path = "data/fisher_all.csv"

# Initialize the analysis helper for this study/dataset.
analysis = RealDataTimeSeriesAnalysis(study_name, folder_path, datafile_path)

# Features used in this run (a subset of the full candidate set below).
# feature_cols = ['energetic','enthusiastic','content','irritable','restless','worried','guilty','afraid','anhedonia','angry','hopeless','down','positive','fatigue','tension','concentrate','ruminate','avoid_act','reassure','procrast','avoid_people']
feature_cols = [
    'down',
    'positive',
    'content',
    'enthusiastic',
    'energetic',
    'hopeless',
    'angry',
    'irritable',
    'reassure',
]

df = analysis.load_data()
subject_ids = df['subject_id'].unique()
In [3]:
def _read_subject_list(path):
    """Read one subject id per line from *path*, stripping surrounding whitespace."""
    with open(path, 'r') as f:
        # Iterate the file object directly instead of materializing f.readlines().
        return [line.strip() for line in f]

# Train/test subject splits were fixed ahead of time and stored on disk so the
# same split is reused across experiments.
random_train_subject_list = _read_subject_list(real_data_path / 'data' / 'random_train_subject_list.txt')
random_test_subject_list = _read_subject_list(real_data_path / 'data' / 'random_test_subject_list.txt')

print("random_train_subject_list", random_train_subject_list)
print("random_test_subject_list", random_test_subject_list)
random_train_subject_list ['p007', 'p040', 'p072', 'p033', 'p137', 'p203', 'p100', 'p010', 'p204', 'p215', 'p001', 'p008', 'p019', 'p169', 'p021', 'p014', 'p074', 'p139', 'p111', 'p048']
random_test_subject_list ['p004', 'p075', 'p117', 'p025', 'p113', 'p145', 'p163', 'p006', 'p217', 'p127', 'p012', 'p160', 'p115', 'p037', 'p013', 'p023', 'p202', 'p009', 'p068', 'p003']
In [4]:
sliding_windows_dict = analysis.create_sliding_windows(df, feature_cols, window_size=5, stride=1)

# Partition per-subject windows by the pre-computed train/test subject split.
# (Raises KeyError if a listed subject has no windows — intentional fail-fast.)
train_sliding_windows_dict = {sid: sliding_windows_dict[sid] for sid in random_train_subject_list}
test_sliding_windows_dict = {sid: sliding_windows_dict[sid] for sid in random_test_subject_list}

train_loader, val_loader, _ = analysis.prepare_datasets(train_sliding_windows_dict)
# NOTE(review): the first loader returned by prepare_datasets is reused as the
# test loader; Lightning warns its sampler has shuffling enabled (see run log).
# Confirm prepare_datasets can produce an unshuffled loader for evaluation.
test_loader, _, _ = analysis.prepare_datasets(test_sliding_windows_dict)

# Train model on the train-split subjects, validating and testing as we go.
print("\033[1m\033[95mTraining model...\033[0m")
best_model, best_model_checkpoint_metrics, test_results = analysis.train_model(train_loader, val_loader, test_loader)
print(f"\033[1m\033[96mBEST MODEL VAL LOSS:\033[0m {best_model_checkpoint_metrics['val_loss']:.5f}")
print(f"\033[1m\033[96mTEST RESULTS:\033[0m loss = {test_results[0]['test_loss']:.5f}")

# Evaluate and visualize on the held-out test subjects.
print("\033[1m\033[95mEvaluating model on test data...\033[0m")
test_eval_data = analysis.evaluate_model(best_model, test_loader)
print("="*80)
2025/05/22 10:47:01 WARNING mlflow.utils.autologging_utils: MLflow pytorch autologging is known to be compatible with 1.9.0 <= torch <= 2.6.0, but the installed version is 2.6.0+cu124. If you encounter errors during autologging, try upgrading / downgrading torch to a compatible version, or try upgrading MLflow.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Training model...
2025/05/22 10:47:10 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/ytli/miniconda3/envs/research/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:476: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
🏃 View run fisher at: http://localhost:8093/#/experiments/92/runs/c5ec109970594df3a7e975e5d78a2619
🧪 View experiment at: http://localhost:8093/#/experiments/92
BEST MODEL VAL LOSS: 0.72193
TEST RESULTS: loss = 1.73910
Evaluating model on test data...
================================================================================
In [5]:
# Interactive per-subject / per-feature plots of the evaluation results.
plot_subject_feature_html_interactive(test_eval_data, feature_cols)
In [6]:
# Per-subject, per-feature MSE on the held-out (generalization) subjects.
mse_df = calculate_mse_by_subject_feature(test_eval_data, feature_cols)
# Plain string literal: the original f-string had no placeholders.
mse_df.to_csv('mse-generalize.csv', index=False)
mse_df.head(20)
Out[6]:
subject_id feature_name mse
0 p004 down 1.064111
1 p004 positive 1.522837
2 p004 content 1.389245
3 p004 enthusiastic 1.459119
4 p004 energetic 1.225313
5 p004 hopeless 1.329111
6 p004 angry 0.936798
7 p004 irritable 1.166241
8 p004 reassure 1.024458
9 p075 down 0.941535
10 p075 positive 1.033084
11 p075 content 1.044805
12 p075 enthusiastic 0.889902
13 p075 energetic 1.104642
14 p075 hopeless 0.988972
15 p075 angry 0.815556
16 p075 irritable 1.263785
17 p075 reassure 0.749723
18 p117 down 0.959132
19 p117 positive 1.213930
In [ ]: