import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import cartopy.crs as ccrs
import cartopy.feature as cfeature

PERCENTILE = 50
WIND_THRESH = 64

# Column names for STORMS data
colnames = ['sim', 'month', 't', 'i', 'basin', 'lat', 'lon', 'min_press', 'wind', 'radius', 'CAT', 'landfall', 'dist_land']

# Read and concatenate STORMS data files (0-9) to create 'all' file
print("Reading STORMS data files...")
storms_data = []

for file_num in range(10):  # Files 0 through 9
    filename = f'STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_{file_num}.txt'
    print(f"Reading {filename}...")
    
    # Read the data file
    df = pd.read_csv(filename, header=None, names=colnames, skipinitialspace=True)
    
    # Add file offset to simulation number to make them unique across files
    df['sim'] = df['sim'] + (file_num * 1000)  # Offset by 1000*file_num to avoid sim number conflicts
    
    storms_data.append(df)

# Concatenate all data
hits = pd.concat(storms_data, ignore_index=True)
print(f"Total records loaded: {len(hits)}")

# Convert wind speed from m/s to knots (1 m/s = 1.94384 knots)
hits['wind'] = hits['wind'] * 1.94384

# Convert longitude from 0-360 to -180-180 format for Atlantic basin
hits['lon'] = np.where(hits['lon'] > 180, hits['lon'] - 360, hits['lon'])

# Filter for North Atlantic basin (basin = 1) 
hits = hits[hits['basin'] == 1].copy()

# Create time column and extract year (using sim as year proxy since each sim represents a season)
# For STORMS data, we'll use 'sim' as the year identifier since each simulation represents one season
hits['year'] = hits['sim']

print(f"Records after filtering for NA basin: {len(hits)}")
print(f"Number of unique simulations (years): {hits['year'].nunique()}")

# Observed Data (keep original observed data processing unchanged)
main1 = pd.read_csv('main1_NA.csv')  # ...header/dtype as before
main1['time2'] = pd.to_datetime(main1['time'])
main1['year'] = main1['time2'].dt.year
main1 = main1[(main1['year'] >= 1944) & (main1['year'] <= 2024)]
num_unique_years = main1['year'].nunique()

lon_bins = np.arange(-100, 12, 2)
lat_bins = np.arange(4, 52, 2)

obs_hit = np.zeros((len(lat_bins)-1, len(lon_bins)-1))
for year in main1['year'].unique():
    year_data = main1[main1['year'] == year]
    # For each grid box, what is the max wind this year?
    H_max, _, _ = np.histogram2d(year_data['lat'], year_data['lon'],
                                 bins=[lat_bins, lon_bins],
                                 weights=year_data['wind'])
    # Where was max wind >= WIND_THRESH?
    hit_mask = (H_max >= WIND_THRESH)
    obs_hit += hit_mask.astype(int)
# Calculate probability (fraction of years)
obs_prob = obs_hit / num_unique_years

# Simulated Data (adapted for STORMS data format)
sim_values = hits['year'].unique()  # Use 'year' (which is sim number) as identifier
num_sims_to_sample = num_unique_years
num_draws = 100
sim_prob_samples = []

print(f"Processing {num_draws} random samples of {num_sims_to_sample} simulations each...")

for draw in range(num_draws):
    if (draw + 1) % 20 == 0:
        print(f"Completed {draw + 1}/{num_draws} draws")
    
    selected_sims = np.random.choice(sim_values, size=num_sims_to_sample, replace=False)
    sim_hit = np.zeros((len(lat_bins)-1, len(lon_bins)-1))
    
    for sim in selected_sims:
        sim_data = hits[hits['year'] == sim]
        H_max, _, _ = np.histogram2d(sim_data['lat'], sim_data['lon'],
                                     bins=[lat_bins, lon_bins],
                                     weights=sim_data['wind'])
        hit_mask = (H_max >= WIND_THRESH)
        sim_hit += hit_mask.astype(int)
    
    sim_prob = sim_hit / num_sims_to_sample
    sim_prob_samples.append(sim_prob)

sim_prob_samples = np.stack(sim_prob_samples)
sim_prob_percentile = np.percentile(sim_prob_samples, PERCENTILE, axis=0)

print("Creating plots...")

# Plot: vmax=0.5 for color scale
fig = plt.figure(figsize=(16, 7), constrained_layout=True)
ax1 = fig.add_subplot(1, 2, 1, projection=ccrs.PlateCarree())
im1 = ax1.imshow(
    obs_prob, origin='lower', aspect='auto', cmap='jet',
    extent=[lon_bins[0], lon_bins[-1], lat_bins[0], lat_bins[-1]],
    norm=Normalize(vmin=0, vmax=0.5),
    transform=ccrs.PlateCarree()
)
ax1.set_title('Observed: Annual Max Wind ≥64 kt Probability')
ax1.set_xlabel('Longitude')
ax1.set_ylabel('Latitude')
ax1.set_extent([lon_bins[0], lon_bins[-1], lat_bins[0], lat_bins[-1]], crs=ccrs.PlateCarree())
ax1.add_feature(cfeature.COASTLINE.with_scale('50m'), edgecolor='white', linewidth=1.2)
ax1.add_feature(cfeature.LAND, facecolor='none', edgecolor='white', linewidth=1.2)

ax2 = fig.add_subplot(1, 2, 2, projection=ccrs.PlateCarree())
im2 = ax2.imshow(
    sim_prob_percentile, origin='lower', aspect='auto', cmap='jet',
    extent=[lon_bins[0], lon_bins[-1], lat_bins[0], lat_bins[-1]],
    norm=Normalize(vmin=0, vmax=0.5),
    transform=ccrs.PlateCarree()
)
ax2.set_title(f'Simulated: Annual Max Wind ≥64 kt Probability (Percentile={PERCENTILE})')
ax2.set_xlabel('Longitude')
ax2.set_extent([lon_bins[0], lon_bins[-1], lat_bins[0], lat_bins[-1]], crs=ccrs.PlateCarree())
ax2.add_feature(cfeature.COASTLINE.with_scale('50m'), edgecolor='white', linewidth=1.2)
ax2.add_feature(cfeature.LAND, facecolor='none', edgecolor='white', linewidth=1.2)

cbar = fig.colorbar(im1, ax=[ax1, ax2], orientation='vertical', fraction=0.025, pad=0.04, label='Probability per Year (Annual Max Wind ≥64 kt)')

# Save both TIFF and JPG formats
plt.savefig('NAwind_prob_comparisonSTORM_64.tiff', format='tiff', dpi=300)
plt.savefig('NAwind_prob_comparisonSTORM_64.jpg', format='jpeg', dpi=300)
print("Plots saved as NAwind_prob_comparisonSTORM_64.tiff and NAwind_prob_comparisonSTORM_64.jpg")

plt.show()

print("Analysis complete!")
Reading STORMS data files...
Reading STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_0.txt...
Reading STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_1.txt...
Reading STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_2.txt...
Reading STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_3.txt...
Reading STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_4.txt...
Reading STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_5.txt...
Reading STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_6.txt...
Reading STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_7.txt...
Reading STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_8.txt...
Reading STORM/STORM_DATA_IBTRACS_NA_1000_YEARS_9.txt...
Total records loaded: 3507558
Records after filtering for NA basin: 3507558
Number of unique simulations (years): 10000
Processing 100 random samples of 81 simulations each...
Completed 20/100 draws
Completed 40/100 draws
Completed 60/100 draws
Completed 80/100 draws
Completed 100/100 draws
Creating plots...
Plots saved as NAwind_prob_comparisonSTORM_64.tiff and NAwind_prob_comparisonSTORM_64.jpg

Analysis complete!