#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy
import scipy.stats as st


# In[2]:


os.chdir('data/')


# In[3]:


files = os.listdir()


# In[4]:


print(files)


# In[5]:


for file in range(len(files)):
    if files[file].endswith('csv') == False:
        file_name = files[file]
        text_file = pd.read_csv(files[file], delim_whitespace=True)
        text_file.to_csv(f'{file_name[:-4]}.csv', index = None)
        os.remove(files[file])


# In[6]:


files = os.listdir()
print(files)
print(len(files))
values = []
for file in range(len(files)):
    index = 0
    name = files[file]
    val = ''
    if name.endswith('.csv'):
        while (True):
            if (name[index].isnumeric()) or (name[index] == '_'):
                if(name[index] == '_'):
                    val = val + '.'
                    index = index + 1
                else:
                    val = val + name[index]
                    index = index + 1
            else:
                break
        values.append(float(val))
print(values)
print(len(values))


# In[7]:


data_sets = []
for file in range(len(files)):
    if name.endswith('.csv'):
        data = pd.read_csv(files[file], header = None)
        data_sets.append(data)


# In[8]:


x = data_sets[0].iloc[:, 0]
y = data_sets[0].iloc[:, 1]
max_val = 0
max_index = 0
for i in range(len(list(x))):
    if x[i] >= 860 and x[i] <=900:
        if y[i] > max_val:
            max_val = y[i]
            max_index = i
print(max_val)
print(max_index)


# In[9]:


max_counts = []
for set in data_sets:
    max_counts.append(set[1][max_index])
print(max_counts)


# 
# 
# 
# 
# #### waterset = pd.read_csv('WatervBF.csv')
# ywater = waterset.iloc[:, 4]
# print(ywater)

# In[10]:


x = max_counts
x = np.array(x)
x = x
y = values
y = np.array(y)
plt.plot(x, y, "b*")
plt.xlabel("Percent fuel")
plt.ylabel("Intensity")
plt.grid()

plt.show()


# In[11]:


coeff = np.polyfit(x, y, 1)
print(coeff)

m = coeff[0]
b = coeff[1]


# In[12]:


Xtrend = np.linspace(x.min(), x.max(), 100)
Ytrend = m * Xtrend + b

plt.plot(x, y, "b*")

plt.plot(Xtrend, Ytrend, 'r-')
plt.plot(Xtrend, Ytrend - np.std(Ytrend), 'r-')
plt.plot(Xtrend, Ytrend + np.std(Ytrend), 'r-')
plt.grid()


# In[13]:


Ytilde = m * x + b
print(y)
print(Ytilde)

plt.plot(x, y, "b*")
plt.plot(Xtrend, Ytrend, 'r-')
plt.plot(x, Ytilde, "g*")
plt.grid()


# In[14]:


residuals = Ytilde - y
plt.plot(x, residuals)
plt.grid()


# In[15]:


r2 = 1 - np.sum(residuals**2) / np.sum((y - np.mean(y))**2)
print(r2)


# In[16]:


error_margin = 1.645 * (np.std(Ytrend) / np.sqrt(len(y)))
print(error_margin)


# In[17]:


x_test = 2786
y_test = m * x_test + b

Xtrend = np.linspace(x[0], x[-1], 100)
Ytrend = m * Xtrend + b

plt.figure(figsize=(15,8), dpi = 80)
plt.plot(x, y, "b*")

plt.plot(Xtrend, Ytrend, 'r-')
plt.plot(Xtrend, (Ytrend - error_margin), 'y-')
plt.plot(Xtrend, (Ytrend  + error_margin), 'y-')
plt.plot(x_test, y_test, 'go')
plt.grid()

print(f"Predicted Value: {y_test}")
print(f"Upper bound: {y_test + error_margin}")
print(f"Lower bound: {y_test - error_margin}")

#conf_int = [y_test - (scipy.stats.norm.ppf(.80) * np.std(Ytrend)), y_test + (scipy.stats.norm.ppf(.80) * np.std(Ytrend))]
#print(conf_int)


# In[25]:


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


# Cannot use Rank 1 matrix in scikit learn
X = x.reshape((len(x), 1))
Y = y.reshape((len(y), 1))
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Creating a model
reg = LinearRegression(fit_intercept=True)

# Fitting training data
reg = reg.fit(X_train, y_train)

# Y Prediciton
y_pred = reg.predict(X_test)

# Calculating R2 Score
r2_score = reg.score(X_test, y_test)
print(f"Accuracy = {r2_score}")

coeffs = [reg.coef_, reg.intercept_]
m = np.double(coeffs[0])
b = np.double(coeffs[1])

print(f"{m}, {b}")

error_margin = 1.645 * (np.std((m * np.linspace(X.min(), X.max(), 100)) + b) / np.sqrt(len(Y)))

plt.figure(figsize=(15,10), dpi = 80)
plt.scatter(X, Y, label = "Sample Points")
plt.plot(np.linspace(X.min(), X.max(), 100), (m * np.linspace(X.min(), X.max(), 100)) + b, label = "Regression Line")
plt.plot(np.linspace(X.min(), X.max(), 100), (m * np.linspace(X.min(), X.max(), 100)) + (b + error_margin), c = 'y', label = "Confidence Band")
plt.plot(np.linspace(X.min(), X.max(), 100), (m * np.linspace(X.min(), X.max(), 100)) + (b - error_margin), c = 'y')
#plt.scatter(X_test, y_pred, c = 'orange', label = "Estimated Points")
#plt.scatter(X_test, y_test, c = 'green', label = "Actual Values")
plt.legend()


# In[19]:


rmse = np.sqrt(r2_score)
rmse


# In[20]:


for pt in range(len(x)):
    print(f"{x[pt]} , {y[pt]}")


# In[21]:


percents_unique = np.unique(y)
print(percents_unique)


# In[22]:


combined = np.vstack((y, x)).T


# In[23]:


print(combined)


# In[24]:


minmax = []
difference = []
for percents in percents_unique:
    max_count = 0
    min_count = 99999
    for i in range(len(combined)):
        if combined[i][0] == percents:
            if combined[i][1] > max_count:
                max_count = combined[i][1]
            if combined[i][1] < min_count:
                min_count = combined[i][1]
    minmax.append((max_count, min_count))
    difference.append(max_count - min_count)
    
for j in range(len(minmax)):
    print(f"{percents_unique[j]}, {minmax[j]}, {difference[j]}")


# In[ ]:


# In[ ]:


# In[ ]: