Problem: what if the number of features (n) is larger than the number of samples (m), so that the matrix X is not full rank? The previous method fails, because computing \((X^TX)^{-1}\) errors out.
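A quick numeric sketch (synthetic data; the shapes are illustrative) makes the failure concrete: when n > m, \(X^TX\) has rank at most m, so its determinant is 0 and it has no inverse.

import numpy as np

m, n = 5, 10                       # fewer samples than features
X = np.random.randn(m, n)
xTx = X.T @ X                      # n x n, but rank at most m
print(np.linalg.matrix_rank(xTx))  # 5, not 10 -> det(xTx) == 0, no inverse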
Other shrinkage methods: lasso, LAR, PCA regression, subset selection.
The lasso constraint: \(\sum_{k=1}^{n} |w_k| \le \lambda\), i.e. a bound on the sum of absolute values of the weights. When \(\lambda\) is small enough, many weights are forced to exactly 0, which reduces the number of features and makes the data easier to interpret, but it also increases the computational cost.
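A small sklearn sketch (synthetic data, assumed shapes) of this sparsity effect: as the penalty grows, more coefficients are driven exactly to 0.

import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
X = rng.randn(100, 20)
y = 3.0 * X[:, 0] + 0.1 * rng.randn(100)   # only feature 0 is informative

for alpha in [0.001, 0.1, 1.0]:
    coef = Lasso(alpha=alpha).fit(X, y).coef_
    print(alpha, (coef == 0).sum())        # count of zeroed weights rises with alpha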
Standard regression (solved directly in matrix form):
from numpy import *   # the MLiA-style code below assumes numpy's namespace

# note: check whether the matrix is invertible before taking the inverse
def standRegres(xArr, yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)   # normal equation: (X^T X)^-1 X^T y
    return ws
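Hypothetical usage on synthetic data (names are illustrative), recovering the line y = 3x + 1:

xArr = [[1.0, i / 100.0] for i in range(100)]   # first column of 1s for the intercept
yArr = [3.0 * x[1] + 1.0 + 0.01 * random.randn() for x in xArr]
ws = standRegres(xArr, yArr)
print(ws)                 # approximately [[1.0], [3.0]]
yHat = mat(xArr) * ws     # fitted values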
Standard regression (statsmodels OLS and sklearn):
import statsmodels.api as sm

X = df[df.columns].values
y = target['MEDV'].values

# add a constant column for the intercept
X = sm.add_constant(X)

# build and fit the model
model = sm.OLS(y, X).fit()
prediction = model.predict(X)
print(model.summary())
from sklearn import linear_model

lm = linear_model.LinearRegression()
model = lm.fit(X, y)
y_pred = lm.predict(X)
lm.score(X, y)    # R^2 on the training data
lm.coef_
lm.intercept_

evaluation(y, y_pred)
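evaluation() is a user-defined helper in these notes, not a library function; a minimal sketch, assuming the signature used here (y, y_pred, optional index_name):

import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluation(y_true, y_pred, index_name='model'):
    # one-row summary of common regression metrics
    return pd.DataFrame({'MSE': [mean_squared_error(y_true, y_pred)],
                         'MAE': [mean_absolute_error(y_true, y_pred)],
                         'R2': [r2_score(y_true, y_pred)]},
                        index=[index_name])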
Locally weighted linear regression (LWLR):
def lwlr(testPoint, xArr, yArr, k=1.0):
    xMat = mat(xArr); yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))
    for j in range(m):                    # next 2 lines create the diagonal weights matrix
        diffMat = testPoint - xMat[j, :]  # Gaussian kernel: closer points get larger weight
        weights[j, j] = exp(diffMat * diffMat.T / (-2.0 * k**2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws
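The original Machine Learning in Action code pairs lwlr with a small driver that predicts every point; a smaller k weights fewer neighbors and fits more locally (risking overfitting):

def lwlrTest(testArr, xArr, yArr, k=1.0):
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):    # one weighted regression per query point
        yHat[i] = lwlr(testArr[i], xArr, yArr, k)
    return yHat

# e.g. yHat = lwlrTest(xArr, xArr, yArr, k=0.01)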
Ridge regression (standardize the features before applying any shrinkage technique):
def ridgeRegres(xMat, yMat, lam=0.2):
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam   # X^T X + lambda*I is invertible for lam > 0
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws
def ridgeTest(xArr, yArr):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean          # to eliminate X0, take the mean off of Y
    # regularize X's
    xMeans = mean(xMat, 0)       # calc mean, then subtract it off
    xVar = var(xMat, 0)          # calc variance of Xi, then divide by it
    xMat = (xMat - xMeans) / xVar
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):  # sweep lambda over exp(-10) .. exp(19)
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
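Hypothetical usage (assumes xArr, yArr are already loaded): plot each coefficient against the lambda index to see the ridge shrinkage path.

import matplotlib.pyplot as plt

wMat = ridgeTest(xArr, yArr)   # 30 x n: one row of weights per lambda
plt.plot(wMat)                 # x-axis i corresponds to lambda = exp(i - 10)
plt.xlabel('log(lambda) + 10')
plt.ylabel('coefficients')
plt.show()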
Ridge regression (sklearn):
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)
y_pred=ridge_reg.predict(X)
evaluation(y,y_pred,index_name='ridge_reg')
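In practice alpha is usually chosen by cross-validation rather than fixed at 1; sklearn's RidgeCV does this directly (a sketch reusing X, y from above):

from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0])   # grid of penalties to try
ridge_cv.fit(X, y)
print(ridge_cv.alpha_)                              # the selected penalty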
Lasso regression (sklearn):
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
y_pred=lasso_reg.predict(X)
evaluation(y,y_pred,index_name='lasso_reg')
Forward stagewise regression:
# helpers used by stageWise (from the same Machine Learning in Action codebase)
def rssError(yArr, yHatArr):
    # residual sum of squares
    return ((yArr - yHatArr)**2).sum()

def regularize(xMat):
    # standardize each column to zero mean and unit variance
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)
    inVar = var(inMat, 0)
    return (inMat - inMeans) / inVar

def stageWise(xArr, yArr, eps=0.01, numIt=100):
    xMat = mat(xArr); yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean             # can also regularize ys, but will get smaller coef
    xMat = regularize(xMat)
    m, n = shape(xMat)
    returnMat = zeros((numIt, n))   # record ws at every iteration
    ws = zeros((n, 1)); wsTest = ws.copy(); wsMax = ws.copy()
    for i in range(numIt):
        print(ws.T)
        lowestError = inf
        for j in range(n):              # try a small step on each feature
            for sign in [-1, 1]:        # in both directions
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:  # keep the step that lowers RSS the most
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
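Hypothetical usage: run 200 iterations with step size 0.01; each row of the returned matrix is the weight vector after one iteration.

wMat = stageWise(xArr, yArr, eps=0.01, numIt=200)
print(wMat[-1])   # weights after the last iteration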
Polynomial regression (sklearn):
from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures(degree=4)
X_Poly = poly_reg.fit_transform(X)
lin_reg_2 = linear_model.LinearRegression()
lin_reg_2.fit(X_Poly, y)
y_pred = lin_reg_2.predict(X_Poly)   # reuse the already-transformed features
evaluation(y, y_pred, index_name='poly_reg')
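The same polynomial fit can be packaged as a single sklearn Pipeline (a sketch), which keeps the feature transform and the regression together and avoids transforming twice:

from sklearn.pipeline import make_pipeline

poly_model = make_pipeline(PolynomialFeatures(degree=4),
                           linear_model.LinearRegression())
poly_model.fit(X, y)
y_pred = poly_model.predict(X)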
Elastic net regression (sklearn):
# l1_ratio=0.7 mixes the penalties: 70% lasso (L1), 30% ridge (L2)
enet_reg = linear_model.ElasticNet(l1_ratio=0.7)
enet_reg.fit(X, y)
y_pred = enet_reg.predict(X)
evaluation(y, y_pred, index_name='enet_reg')