Press "Enter" to skip to content

k-means实现代码

k-means是一种
​迭代
求解的聚类算法,是将数据分为K组,k由人为指定,随机选取一个中心点
为初始的聚类中心,然后重复计算数据到之前 n 个聚类中心最远的距离,
把每个对象分配给距离它最近的聚类中心。聚类中心以及分配给它们的对象就代表一个
​聚类
。每分配一个样本,聚类的聚类中心会根据聚类中现有的对象被重新计算。这个过程将不断重复直到满足某个终止条件。终止条件可以是没有(或最小数目)对象被重新分配给不同的聚类,没有(或最小数目)聚类中心再发生变化,
​误差平方和
局部最小。

初始化聚类中心代码:

 

/**
   * k-means++初始化聚类中心
   */  def kmeansppInitial(points:List[UserFeatureBean],k: Int):Array[Vector[Double]] = {
    //数据集个数
    val pointsNum = points.length
    val random  = new Random()
    var kSum = 1
    var flag = true
    //选择第一个随机数(下标)
    val temp = random.nextInt(pointsNum)
    //保存随机下标号
    var array = new ListBuffer[Int]()
    //迭代添加元素的聚类中心数组
    var updatedCenters = new ListBuffer[Vector[Double]]()
    var sum = 0.0
    var randomSeed = 0.0
    //保存每个样本点对应到各自聚类中心的距离
    var pointsAndDist = List[Double]()
    var j = 0
    array = array :+ temp
    //将随机选择的点作为第一个聚类中心
    updatedCenters = updatedCenters :+ points(temp).getVector
    while(kSum < k ){
      //计算每个样本点与它所属的聚类中心的距离
      pointsAndDist = points.map(v => 
        vectorDis(v.getVector,closestCenter(updatedCenters.toArray,v.getVector))
      )
      sum = pointsAndDist.reduceLeft((a,b) => a + b)
      println("sum=="+ sum)
      flag = true
      while(flag){
        randomSeed = sum * (random.nextInt(100) + 1) / 100
        breakable{
          for(i <- 0 to pointsAndDist.length - 1){
            randomSeed -= pointsAndDist(i)
            if(randomSeed <= 0){
              j = i
              break
            }
          }
        }
        if(sum == 0.0){
          flag = false
          kSum += 1
        }else if(array.contains(j)){
          //求得的新中心点的下标在数组中存在
          flag= true
        }else{
          array = array :+ j
          updatedCenters = updatedCenters :+ points(j).getVector
          flag = false
          kSum += 1
        }
      }
    }
    //kmean++初始化中心点如下
    val centers = new Array[Vector[Double]](updatedCenters.length)
    for(i <- 0 to updatedCenters.length - 1){
      centers(i) = updatedCenters(i)
    }
    centers
  }

 

迭代做聚类代码:

 

/**
   * 迭代做聚类
   */  def kmeans(points:List[UserFeatureBean],centers:Array[Vector[Double]]) = {
    var bool = true
    var newCenters = Array[Vector[Double]]()
    var move = 0.0
    //当前的代价函数值
    var currentCost = 0.0 
    var newCost = 0.0
    //根据每个样本点最近的聚类中心进行groupBy分组
    while(bool){
      //迭代更新聚类中心,直到最优
      move = 0.0
      currentCost = computeCost(points,centers)
      val cluster: Map[Vector[Double], List[UserFeatureBean]] = points.groupBy(v => closestCenter(centers,v.getVector))
      newCenters =
        centers.map(oldCenter => {
          cluster.get(oldCenter) match {
          //找到该聚类中心所拥有的点集
            case Some(pointsInThisCluster) =>
              val a: List[Vector[Double]] = pointsInThisCluster.map(v=>v.getVector)
              
              //均值作为新的聚类中心
              vectorDivide(a.reduceLeft((v1, v2) => vectorAdd(v1,v2)),pointsInThisCluster.length)
            case None => oldCenter
          }
        })
      for(i <- 0 to centers.length - 1){
        centers(i) = newCenters(i)
      }
 
      //新的代价函数值
      newCost = computeCost(points,centers)
      if(math.sqrt(vectorDis(Vector(currentCost),Vector(newCost))) < 0.0000000001)
        bool = false
    }
   
  }

 

输出聚类结果代码:

 

/**
   * 输出聚类结果
   * @param points
   * @param centers
   * @return
   */  def printResult(points:List[UserFeatureBean],centers:Array[Vector[Double]]) = {
    //将每个点的聚类中心用centers中的下标表示,属于同一类的点拥有相同的下标
    val pointsNum = points.length
    val pointsLabel = new Array[Int](pointsNum)
    var closetCenter = Vector[Double]()
    //"聚类结果如下
    for(i <- 0 to pointsNum - 1){
      closetCenter = centers.reduceLeft((c1,c2) => if (vectorDis(c1,points(i).getVector) < vectorDis(c2,points(i).getVector)) c1 else c2)
      pointsLabel(i) = centers.indexOf(closetCenter)
      points(i).setGroup(pointsLabel(i))
    }
    points.map(v=>{
      println(v.getVector+"-----------"+v.getId+"-----------"+v.getGroup)
    })
  }

 

找到某样本点所属的聚类中心代码:

 

/**
   * 找到某样本点所属的聚类中心
   * @param centers
   * @param v
   * @return
   */  def closestCenter(centers:Array[Vector[Double]],v:Vector[Double]):Vector[Double] = {
    centers.reduceLeft((c1,c2) =>
      if(vectorDis(c1,v) < vectorDis(c2,v)) c1 else c2
    )
  }

 

向量间的欧式距离代码:

 

/**
   * 向量间的欧式距离
   * @param v1
   * @param v2
   * @return
   */  def vectorDis(v1: Vector[Double], v2: Vector[Double]):Double = {
    var distance = 0.0
    for(i <- 0 to v1.length - 1){
      distance += (v1(i) - v2(i)) * (v1(i) - v2(i))
    }
    distance = math.sqrt(distance)
    distance
  }

 

向量加法代码:

 

/**
   * 向量加法
   * @param v1
   * @param v2
   * @return
   */  def vectorAdd(v1:Vector[Double],v2:Vector[Double]):Vector[Double] = {
    var v3 = v1
    for(i <- 0 to v1.length - 1){
      v3 = v3.updated(i,v1(i) + v2(i))
    }
    v3
  }

 

向量除法代码:

 

/**
   * 向量除法
   * @param v
   * @param num
   * @return
   */  def vectorDivide(v:Vector[Double],num:Int):Vector[Double] = {
    var r = v
    for(i <- 0 to v.length - 1){
      r = r.updated(i,r(i) / num)
    }
    r
  }

 

实体类:

 

class UserFeatureBean extends Serializable{
  @BeanProperty var id:String=null
  @BeanProperty var group:Int= -1
  @BeanProperty var vector:Vector[Double]=null
  @BeanProperty var mapVector:Map[String,Vector[Double]]=null
}

 

def main(args: Array[String]): Unit = {
        val centers = kmeansServer.kmeansppInitial(List[UserFeatureBean], 10)
    kmeansServer.kmeans(List[UserFeatureBean], centers)
    kmeansServer.printResult(List[UserFeatureBean], centers)
}

Be First to Comment

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注