Basic regression in PyTorch
import pandas as pd
import torch
import statsmodels.formula.api as smf
import statsmodels as sm
import seaborn as sns
import matplotlib.pyplot as plt
## Read in the data and display a few rows
dat = pd.read_csv("https://raw.githubusercontent.com/bcaffo/ds4bme_intro/master/data/oasis.csv")
dat.head(4)
|   | FLAIR | PD | T1 | T2 | FLAIR_10 | PD_10 | T1_10 | T2_10 | FLAIR_20 | PD_20 | T1_20 | T2_20 | GOLD_Lesions |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.143692 | 1.586219 | -0.799859 | 1.634467 | 0.437568 | 0.823800 | -0.002059 | 0.573663 | 0.279832 | 0.548341 | 0.219136 | 0.298662 | 0 |
| 1 | 1.652552 | 1.766672 | -1.250992 | 0.921230 | 0.663037 | 0.880250 | -0.422060 | 0.542597 | 0.422182 | 0.549711 | 0.061573 | 0.280972 | 0 |
| 2 | 1.036099 | 0.262042 | -0.858565 | -0.058211 | -0.044280 | -0.308569 | 0.014766 | -0.256075 | -0.136532 | -0.350905 | 0.020673 | -0.259914 | 0 |
| 3 | 1.037692 | 0.011104 | -1.228796 | -0.470222 | -0.013971 | -0.000498 | -0.395575 | -0.221900 | 0.000807 | -0.003085 | -0.193249 | -0.139284 | 0 |
sns.scatterplot(x = dat['T2'], y = dat['PD'])
(Figure: scatterplot of PD versus T2.)
fit = smf.ols('PD ~ T2', data = dat).fit()
fit.summary()
| Dep. Variable: | PD | R-squared: | 0.661 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.657 |
| Method: | Least Squares | F-statistic: | 190.9 |
| Date: | Mon, 29 Jan 2024 | Prob (F-statistic): | 9.77e-25 |
| Time: | 07:02:58 | Log-Likelihood: | -57.347 |
| No. Observations: | 100 | AIC: | 118.7 |
| Df Residuals: | 98 | BIC: | 123.9 |
| Df Model: | 1 | | |
| Covariance Type: | nonrobust | | |

| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| Intercept | 0.3138 | 0.052 | 6.010 | 0.000 | 0.210 | 0.417 |
| T2 | 0.7832 | 0.057 | 13.815 | 0.000 | 0.671 | 0.896 |

| Omnibus: | 1.171 | Durbin-Watson: | 1.501 |
|---|---|---|---|
| Prob(Omnibus): | 0.557 | Jarque-Bera (JB): | 0.972 |
| Skew: | 0.241 | Prob(JB): | 0.615 |
| Kurtosis: | 2.995 | Cond. No. | 1.89 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
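The numbers in the summary can also be pulled out programmatically, which is handy for checks like the ones below. A minimal sketch using standard statsmodels result attributes:

## The fitted coefficients, a pandas Series indexed by term
print(fit.params)
## The 95% confidence intervals and R-squared
print(fit.conf_int())
print(fit.rsquared)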
# The in-sample predictions
yhat = fit.predict(dat['T2'])
# Make sure that it's adding the intercept
#test = 0.3138 + dat['T2'] * 0.7832
#sns.scatterplot(yhat,test)
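The commented-out check above hard-codes the rounded coefficients. A runnable version of the same idea, assuming only the fit object from above, pulls them from fit.params:

## The same intercept check without hard-coded coefficients
test = fit.params['Intercept'] + dat['T2'] * fit.params['T2']
## Should print something on the order of machine precision
print((yhat - test).abs().max())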
## A plot of the in-sample predicted values
## versus the actual outcomes
sns.scatterplot(x = yhat, y = dat['PD'])
plt.plot([-1, 3], [-1, 3], linewidth=2)
(Figure: in-sample predictions versus observed PD, with the identity line.)
n = dat.shape[0]
## Get the y and x from the data frame as torch tensors
xtraining = torch.from_numpy(dat['T2'].values)
ytraining = torch.from_numpy(dat['PD'].values)
## PyTorch's nn.Linear defaults to float32, so cast the float64 tensors
xtraining = xtraining.float()
ytraining = ytraining.float()
## The tensors are 1D of length n; the model expects n x 1
## unsqueeze adds the second dimension
xtraining = xtraining.unsqueeze(1)
ytraining = ytraining.unsqueeze(1)
## Show that everything is the right size
[xtraining.shape,
ytraining.shape,
[n, 1]
]
[torch.Size([100, 1]), torch.Size([100, 1]), [100, 1]]
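As a sanity check before any gradient descent, the least squares solution can also be computed in closed form within PyTorch itself. A sketch, assuming a reasonably recent PyTorch (torch.linalg.lstsq), on a design matrix with a column of ones for the intercept:

## Closed-form least squares: solve X beta = y
X = torch.cat([torch.ones(n, 1), xtraining], dim=1)
beta = torch.linalg.lstsq(X, ytraining).solution
## Should match the statsmodels intercept and slope
print(beta)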
## Linear regression as a PyTorch model: a single linear layer
model = torch.nn.Sequential(
torch.nn.Linear(1, 1)
)
## Summed squared error loss (MSE up to a factor of 1/n)
loss_fn = torch.nn.MSELoss(reduction='sum')
## Set the optimizer
## There are lots of choices
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
## Loop over iterations
for t in range(10000):
## Forward propagation
y_pred = model(xtraining)
## The loss for this iteration
loss = loss_fn(y_pred, ytraining)
#print(t, loss.item() / n)
## Zero out the gradients before adding them up
optimizer.zero_grad()
## Backprop
loss.backward()
## Optimization step
optimizer.step()
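With this learning rate and 10,000 iterations the loop should be close to the least squares solution; other optimizers, for example torch.optim.Adam, typically need less learning-rate tuning. A quick convergence check, as a sketch:

## Final loss per observation, computed without tracking gradients
with torch.no_grad():
    print(loss_fn(model(xtraining), ytraining).item() / n)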
## In-sample fitted values from the PyTorch model
ytest = model(xtraining).detach().numpy().reshape(-1)
## Plot them against the statsmodels fitted values
sns.scatterplot(x = ytest, y = yhat)
plt.plot([-1, 3], [-1, 3], linewidth=2)
(Figure: PyTorch fitted values versus statsmodels fitted values, with the identity line.)
## The learned parameters: the weight (slope), then the bias (intercept)
for param in model.parameters():
print(param.data)
tensor([[0.7831]])
tensor([0.3138])
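The slope and intercept agree with the OLS fit to the displayed precision. A sketch that makes the comparison explicit by indexing into the Sequential container:

## The single Linear layer holds the slope (weight) and intercept (bias)
print(model[0].weight.item(), fit.params['T2'])
print(model[0].bias.item(), fit.params['Intercept'])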