The MCOS procedure for solving for the optimal weights w consists of five steps:

1. Estimate the mean and covariance: generate a matrix of observations using the original mean vector and covariance matrix as parameters, then compute the sample mean and covariance of that matrix.

```
import numpy as np, pandas as pd
from sklearn.covariance import LedoitWolf

def simCovMu(mu0, cov0, nObs, shrink=False):
    # Draw nObs observations from N(mu0, cov0), then estimate mean and covariance
    x = np.random.multivariate_normal(mu0.flatten(), cov0, size=nObs)
    mu1 = x.mean(axis=0).reshape(-1, 1)
    if shrink:
        cov1 = LedoitWolf().fit(x).covariance_  # Ledoit-Wolf shrinkage estimator
    else:
        cov1 = np.cov(x, rowvar=0)
    return mu1, cov1
```
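
For example, a minimal sketch of step 1 (the ten-asset universe, sample length, and random "true" parameters below are illustrative assumptions, not values from the original text):

```
np.random.seed(0)
nAssets, nObs = 10, 252
# Hypothetical true parameters: a random positive-definite covariance and mean vector
a = np.random.normal(size=(nAssets, nAssets))
cov0 = a @ a.T + np.diag(np.random.uniform(1., 2., nAssets))
mu0 = np.random.uniform(0.05, 0.1, nAssets).reshape(-1, 1)
mu1, cov1 = simCovMu(mu0, cov0, nObs, shrink=False)  # one simulated estimate
```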

2. De-noising: this step addresses the instability caused by noise, discussed above. First, use a kernel density estimate (KDE) to fit a Marcenko-Pastur distribution to the empirical eigenvalue distribution; this separates the noise-related eigenvalues from the signal-related ones.

```
from sklearn.neighbors import KernelDensity
from scipy.optimize import minimize

def fitKDE(obs, bWidth=.25, kernel='gaussian', x=None):
    # Fit kernel to a series of obs, and derive the prob of obs
    # x is the array of values on which the fit KDE will be evaluated
    if len(obs.shape) == 1: obs = obs.reshape(-1, 1)
    kde = KernelDensity(kernel=kernel, bandwidth=bWidth).fit(obs)
    if x is None: x = np.unique(obs).reshape(-1, 1)
    if len(x.shape) == 1: x = x.reshape(-1, 1)
    logProb = kde.score_samples(x)  # log(density)
    pdf = pd.Series(np.exp(logProb), index=x.flatten())
    return pdf
#------------------------------------------------------------------------------
def mpPDF(var, q, pts):
    # Marcenko-Pastur pdf
    # q=T/N
    eMin, eMax = var*(1-(1./q)**.5)**2, var*(1+(1./q)**.5)**2
    eVal = np.linspace(eMin, eMax, pts)
    pdf = q/(2*np.pi*var*eVal)*((eMax-eVal)*(eVal-eMin))**.5
    pdf = pd.Series(pdf, index=eVal)
    return pdf
#------------------------------------------------------------------------------
def errPDFs(var, eVal, q, bWidth, pts=1000):
    # Fit error between theoretical and empirical pdfs
    pdf0 = mpPDF(var, q, pts)  # theoretical pdf
    pdf1 = fitKDE(eVal, bWidth, x=pdf0.index.values)  # empirical pdf
    sse = np.sum((pdf1-pdf0)**2)
    return sse
#------------------------------------------------------------------------------
def findMaxEval(eVal, q, bWidth):
    # Find max random eVal by fitting Marcenko's dist to the empirical one
    out = minimize(lambda x, *args: errPDFs(x[0], *args), .5,
                   args=(eVal, q, bWidth), bounds=((1E-5, 1-1E-5),))
    if out['success']: var = out['x'][0]
    else: var = 1
    eMax = var*(1+(1./q)**.5)**2
    return eMax, var
#------------------------------------------------------------------------------
def corr2cov(corr, std):
    cov = corr*np.outer(std, std)
    return cov
#------------------------------------------------------------------------------
def cov2corr(cov):
    # Derive the correlation matrix from a covariance matrix
    std = np.sqrt(np.diag(cov))
    corr = cov/np.outer(std, std)
    corr[corr < -1], corr[corr > 1] = -1, 1  # numerical error
    return corr
#------------------------------------------------------------------------------
def getPCA(matrix):
    # Get eVal,eVec from a Hermitian matrix
    eVal, eVec = np.linalg.eigh(matrix)
    indices = eVal.argsort()[::-1]  # arguments for sorting eVal desc
    eVal, eVec = eVal[indices], eVec[:, indices]
    eVal = np.diagflat(eVal)
    return eVal, eVec
#------------------------------------------------------------------------------
def denoisedCorr(eVal, eVec, nFacts):
    # Remove noise from corr by fixing random eigenvalues
    eVal_ = np.diag(eVal).copy()
    eVal_[nFacts:] = eVal_[nFacts:].sum()/float(eVal_.shape[0]-nFacts)
    eVal_ = np.diag(eVal_)
    corr1 = np.dot(eVec, eVal_).dot(eVec.T)
    corr1 = cov2corr(corr1)
    return corr1
#------------------------------------------------------------------------------
def deNoiseCov(cov0, q, bWidth):
    # De-noise a covariance matrix: de-noise its correlation matrix, then rescale
    corr0 = cov2corr(cov0)
    eVal0, eVec0 = getPCA(corr0)
    eMax0, var0 = findMaxEval(np.diag(eVal0), q, bWidth)
    nFacts0 = eVal0.shape[0]-np.diag(eVal0)[::-1].searchsorted(eMax0)
    corr1 = denoisedCorr(eVal0, eVec0, nFacts0)
    cov1 = corr2cov(corr1, np.diag(cov0)**.5)
    return cov1
```
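
Continuing the step-1 sketch, the simulated covariance can then be de-noised (bWidth=0.01 is an illustrative bandwidth choice):

```
q = nObs * 1. / cov1.shape[1]  # q = T/N, the ratio behind the Marcenko-Pastur bounds
cov1_dn = deNoiseCov(cov1, q, bWidth=0.01)  # de-noised covariance estimate
```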

3. Optimization: compute the optimal weights with each method under comparison, such as CVO or the NCO algorithm introduced above. NCO controls the instability caused by the signal; its code is as follows:

```
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
#------------------------------------------------------------------------------
def clusterKMeansBase(corr0, maxNumClusters=None, n_init=10):
    # Cluster the correlation matrix, selecting the number of clusters
    # by the silhouette-based quality statistic
    dist, silh = ((1-corr0.fillna(0))/2.)**.5, pd.Series(dtype=float)  # distance matrix
    if maxNumClusters is None:
        maxNumClusters = corr0.shape[0]//2
    for init in range(n_init):
        for i in range(2, maxNumClusters+1):  # find optimal num clusters
            kmeans_ = KMeans(n_clusters=i, n_init=1)
            kmeans_ = kmeans_.fit(dist)
            silh_ = silhouette_samples(dist, kmeans_.labels_)
            stat = (silh_.mean()/silh_.std(), silh.mean()/silh.std())
            if np.isnan(stat[1]) or stat[0] > stat[1]:
                silh, kmeans = silh_, kmeans_
    newIdx = np.argsort(kmeans.labels_)
    corr1 = corr0.iloc[newIdx]  # reorder rows
    corr1 = corr1.iloc[:, newIdx]  # reorder columns
    clstrs = {i: corr0.columns[np.where(kmeans.labels_ == i)[0]].tolist()
              for i in np.unique(kmeans.labels_)}  # cluster members
    silh = pd.Series(silh, index=dist.index)
    return corr1, clstrs, silh
#------------------------------------------------------------------------------
def optPort(cov, mu=None):
    # Convex (Markowitz) solution; minimum-variance portfolio when mu is None
    inv = np.linalg.inv(cov)
    ones = np.ones(shape=(inv.shape[0], 1))
    if mu is None: mu = ones
    w = np.dot(inv, mu)
    w /= np.dot(ones.T, w)
    return w
#------------------------------------------------------------------------------
def optPort_nco(cov, mu=None, maxNumClusters=None):
    # Nested Clustered Optimization: solve within clusters, then across clusters
    cov = pd.DataFrame(cov)
    if mu is not None: mu = pd.Series(mu[:, 0])
    corr1 = cov2corr(cov)
    corr1, clstrs, _ = clusterKMeansBase(corr1, maxNumClusters, n_init=10)
    wIntra = pd.DataFrame(0., index=cov.index, columns=clstrs.keys())
    for i in clstrs:
        cov_ = cov.loc[clstrs[i], clstrs[i]].values
        mu_ = (None if mu is None else mu.loc[clstrs[i]].values.reshape(-1, 1))
        wIntra.loc[clstrs[i], i] = optPort(cov_, mu_).flatten()
    cov_ = wIntra.T.dot(np.dot(cov, wIntra))  # reduce covariance matrix
    mu_ = (None if mu is None else wIntra.T.dot(mu))
    wInter = pd.Series(optPort(cov_, mu_).flatten(), index=cov_.index)
    nco = wIntra.mul(wInter, axis=1).sum(axis=1).values.reshape(-1, 1)
    return nco
```
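
Continuing the sketch, both solutions can be computed side by side on the de-noised covariance (the minimum-variance case, mu=None; maxNumClusters=5 is an arbitrary illustrative cap):

```
w_cvo = optPort(cov1_dn, mu=None)                        # plain convex (CVO) weights
w_nco = optPort_nco(cov1_dn, mu=None, maxNumClusters=5)  # nested clustered (NCO) weights
```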

4. Monte Carlo simulation: combining all of the steps above, run the simulation many times; for the mean and covariance estimated in step 1 on each run, compute the corresponding optimal weights.

```
def monteCarlo(mu0, cov0, nObs, nSims, bWidth, minVarPortf, shrink):
    # Repeat steps 1-3 nSims times; collect CVO and NCO weights per simulation
    w1 = pd.DataFrame(columns=range(cov0.shape[0]),
                      index=range(nSims), dtype=float)
    w1_d = w1.copy(deep=True)
    for i in range(nSims):
        mu1, cov1 = simCovMu(mu0, cov0, nObs, shrink)
        if minVarPortf: mu1 = None
        if bWidth > 0: cov1 = deNoiseCov(cov1, nObs*1./cov1.shape[1], bWidth)
        w1.loc[i] = optPort(cov1, mu1).flatten()
        w1_d.loc[i] = optPort_nco(cov1, mu1, int(cov1.shape[0]/2)).flatten()
    return w1, w1_d
```
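
Running the experiment end to end on the hypothetical mu0/cov0 from the step-1 sketch (the number of simulations and bandwidth are illustrative):

```
# Minimum-variance experiment: w1 collects CVO weights, w1_d collects NCO weights
w1, w1_d = monteCarlo(mu0, cov0, nObs=252, nSims=100, bWidth=0.01,
                      minVarPortf=True, shrink=False)
```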

5. Error estimation: compare the weights computed in step 4 with the optimal weights computed from the original mean and covariance. The error can be measured by any of the definitions below, or any other reasonable one; a minimal sketch implementing all three follows the list:

a. Mean error: the deviation of the expected outcome w'μ of the estimated weights from that of the true optimal weights.

b. Variance error: the deviation of the portfolio variance w'Σw.

c. Sharpe ratio error: the deviation of the Sharpe ratio w'μ/(w'Σw)^0.5.
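
One reasonable way to implement all three definitions, reusing w1 from the step-4 sketch and evaluating every portfolio under the true parameters mu0 and cov0 (the helper portStats is hypothetical, not part of the original code):

```
def portStats(w):
    # Evaluate a weight vector under the true parameters:
    # expected outcome w'mu, variance w'cov w, and the implied Sharpe ratio
    ret = float(w @ mu0.flatten())
    var = float(w @ cov0 @ w)
    return ret, var, ret / var**.5

w0 = optPort(cov0, None).flatten()             # true minimum-variance weights
ret0, var0, sr0 = portStats(w0)
sim = np.array([portStats(w) for w in w1.values])
errMean   = np.abs(sim[:, 0] - ret0).mean()    # a. mean (expected-outcome) error
errVar    = np.abs(sim[:, 1] - var0).mean()    # b. variance error
errSharpe = np.abs(sim[:, 2] - sr0).mean()     # c. Sharpe ratio error
```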

The full MCOS implementation is open-sourced at `https://github.com/enjine-com/mcos`; the end-to-end example below uses it together with pypfopt:

```
import numpy as np
import pandas as pd
from mcos import optimizer
from mcos import mcos
from mcos.error_estimator import ExpectedOutcomeErrorEstimator, SharpeRatioErrorEstimator, \
    VarianceErrorEstimator
from mcos.covariance_transformer import DeNoiserCovarianceTransformer
from mcos.observation_simulator import MuCovLedoitWolfObservationSimulator, \
    MuCovObservationSimulator
from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import sample_cov
import warnings
warnings.filterwarnings('ignore')

# Create dataframe of price history to use for expected returns and covariance
def prices_df() -> pd.DataFrame:
    tickers = ['goog', 'baba', 'amzn', 'wmt', 'glpi', 'bac', 'uaa', 'shld', 'jpm', 'sbux',
               'amd', 'aapl', 'bby', 'ge', 'rrc', 'ma', 'fb']
    total_df = pd.DataFrame()
    for ticker in tickers:
        temp = pd.read_csv(ticker + '.us.txt', parse_dates=True, index_col='Date')
        temp = pd.DataFrame(temp['Close']).rename(columns={'Close': ticker})
        if total_df.empty:
            total_df = temp
        else:
            total_df = total_df.join(temp)
    return total_df

# Choose the number of simulations to run
num_sims = 50
# Select the optimizers that you would like to compare
op = [optimizer.HRPOptimizer(), optimizer.MarkowitzOptimizer(),
      optimizer.NCOOptimizer(), optimizer.RiskParityOptimizer()]
# Select the metric to use for comparison
ee = ExpectedOutcomeErrorEstimator()
# Select your optional covariance transformer
cov_trans = DeNoiserCovarianceTransformer()
# Convert price history to expected returns and covariance matrix
prices = prices_df()  # read the price files once and reuse
mu = mean_historical_return(prices).values
cov = sample_cov(prices).values
obs_sim = MuCovObservationSimulator(mu, cov, num_sims)
# Run the simulation
results = mcos.simulate_optimizations(obs_sim, num_sims, op, ee, [cov_trans])
print(results)
results.plot.bar()
```