当前位置:网站首页>Traditional K-means implementation
Traditional K-means implementation
2022-07-24 06:10:00 【A little cute C】
Driver class (program entry point)
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
/**
 * Command-line driver for the single-machine K-means implementation.
 *
 * <p>Usage: {@code KmeansRun <dataName> <k>}. The data file is read from
 * {@code DataUtil.HDFS_INPUT + "/" + dataName}; the labelled points are written to
 * {@code DataUtil.HDFS_OUTPUT + "/result.txt"}.
 */
public class KmeansRun {

    /** Upper bound on assign/update rounds, in case the centers never stop moving. */
    private static final int MAX_ITERATIONS = 1000;

    /**
     * Runs K-means: reads the data, picks k initial centers, then alternates between
     * assigning points and recomputing centers until the centers stop moving (or the
     * iteration cap is hit), and finally writes the labelled points out.
     *
     * @param args {@code args[0]} is the input file name, {@code args[1]} is the number of clusters
     * @throws IllegalArgumentException if an argument is missing, or k is not a positive integer
     * @throws IllegalStateException    if the input contains no data points
     */
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException {
        long start = System.currentTimeMillis(); // Starting time

        // Fail with a usable message instead of a bare ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("Usage: KmeansRun <dataName> <k>");
        }
        String dataName = args[0];
        int k = Integer.parseInt(args[1]);
        if (k < 1) {
            throw new IllegalArgumentException("k must be at least 1, got " + k);
        }

        String outName = "result.txt";
        String outPath = DataUtil.HDFS_OUTPUT + "/" + outName; // Output path of the file
        String dataPath = DataUtil.HDFS_INPUT + "/" + dataName; // Input path of the file

        // Read the data points
        ArrayList<ArrayList<Double>> data = DataUtil.readCenter(dataPath);
        if (data.isEmpty()) {
            throw new IllegalStateException("No data points were read from " + dataPath);
        }

        // Choose the initial cluster centers
        ArrayList<ArrayList<Double>> center = CenterRandom.centerRandomChoice(data, k);

        // Iterate until the centers no longer change
        for (int i = 0; i < MAX_ITERATIONS; i++) {
            System.out.println("--------------------------"+i+"----------------------------------");
            // Assign every point to a cluster
            ArrayList<DataWithIndex.dataWithIndex> dataindex = DistributionData.distributData(data, center);
            // Recompute the centers from that assignment
            ArrayList<ArrayList<Double>> newcenter = CalCenters.calCenters(dataindex, k);
            // Total movement between the new and the old centers
            double dis = CalUtil.calDistanceBetweenCenters(newcenter, center);
            System.out.println("------------------------dis----------------------------");
            System.out.println(dis);
            // Exact comparison is intentional: stop only when no center moved at all
            if (dis == 0) {
                break;
            }
            // The new centers become the old centers of the next round
            center = newcenter;
        }

        // The final clustering result
        ArrayList<DataWithIndex.dataWithIndex> result = DistributionData.distributData(data, center);
        writeOut.writeout(outPath, result);

        long end = System.currentTimeMillis(); // End time
        System.out.println(" Total time consuming "+(end-start)+" millisecond "); // Time consuming
    }
}
Data class that stores a point together with its cluster label
import java.util.ArrayList;
// Data with cluster markers
/**
 * Holder for the nested {@link dataWithIndex} type; it carries no state of its own.
 */
public class DataWithIndex {

    /** Creates an empty holder. */
    public DataWithIndex() {
    }

    /**
     * One data vector together with the index of the cluster it is marked with.
     * A fresh instance has a {@code null} vector and index {@code 0}.
     */
    static class dataWithIndex {
        /** Cluster marker. */
        int index;
        /** The data vector. */
        ArrayList<Double> data;

        /** Stores the given cluster marker. */
        public void setIndex(int index) {
            this.index = index;
        }

        /** Stores a reference to the given vector (no copy is made). */
        public void setData(ArrayList<Double> data) {
            this.data = data;
        }

        /** @return the cluster marker */
        public int getIndex() {
            return this.index;
        }

        /** @return the stored vector reference */
        public ArrayList<Double> getData() {
            return this.data;
        }
    }
}
Class that randomly selects the initial cluster centers
import java.util.ArrayList;
import java.util.Random;
/**
 * Picks the initial cluster centers for K-means.
 */
public class CenterRandom {

    /**
     * Randomly selects {@code k} distinct data points to serve as initial centers.
     *
     * <p>Points are drawn without replacement, so the same row is never returned twice.
     * Each returned center is a copy of the chosen row, so later changes to a center
     * cannot alter the data set.
     *
     * @param data the data points to choose from
     * @param k    the number of centers wanted; a value of zero or less yields an empty list
     * @return a list of {@code k} centers
     * @throws IllegalArgumentException if {@code k} is larger than the number of data points
     */
    public static ArrayList<ArrayList<Double>> centerRandomChoice(ArrayList<ArrayList<Double>> data, int k) {
        ArrayList<ArrayList<Double>> center = new ArrayList<>();
        if (k <= 0) {
            return center;
        }
        int elementsSize = data.size();
        if (k > elementsSize) {
            throw new IllegalArgumentException(
                    "Cannot choose " + k + " distinct centers from " + elementsSize + " data points");
        }

        // Partial Fisher-Yates shuffle over the row indices: after step i, the first
        // i + 1 slots hold a uniformly random selection of distinct indices.
        int[] order = new int[elementsSize];
        for (int i = 0; i < elementsSize; i++) {
            order[i] = i;
        }
        Random random = new Random();
        for (int i = 0; i < k; i++) {
            int pick = i + random.nextInt(elementsSize - i);
            int chosen = order[pick];
            order[pick] = order[i];
            order[i] = chosen;
            center.add(new ArrayList<>(data.get(chosen)));
        }
        return center;
    }
}
Class that computes the new cluster centers
import java.util.ArrayList;
/**
 * Recomputes the cluster centers from a labelled data set.
 */
public class CalCenters {

    /**
     * Computes, for every cluster label {@code 0 .. k-1}, the component-wise mean of the
     * points carrying that label.
     *
     * <p>The data is scanned once; each cluster's sum is accumulated in data order, so
     * the result is the same as summing one cluster at a time. Points whose label lies
     * outside {@code [0, k)} are ignored. A cluster that received no points is
     * represented by an empty list.
     *
     * @param dataindex the data points, each tagged with a cluster label
     * @param k         the number of clusters; zero or less yields an empty result
     * @return a list of {@code k} centers, where position {@code n} is the mean of cluster {@code n}
     */
    public static ArrayList<ArrayList<Double>> calCenters(ArrayList<DataWithIndex.dataWithIndex> dataindex, int k) {
        ArrayList<ArrayList<Double>> center = new ArrayList<>();
        if (k <= 0) {
            return center;
        }

        // One running sum and one point counter per cluster.
        int[] counts = new int[k];
        for (int n = 0; n < k; n++) {
            center.add(new ArrayList<Double>());
        }

        for (DataWithIndex.dataWithIndex one : dataindex) {
            int label = one.getIndex();
            if (label < 0 || label >= k) {
                continue;
            }
            ArrayList<Double> sum = center.get(label);
            ArrayList<Double> values = one.getData();
            if (sum.isEmpty()) {
                // First point of this cluster: start the sum from a copy of it.
                sum.addAll(values);
            } else {
                for (int i = 0; i < sum.size(); i++) {
                    sum.set(i, sum.get(i) + values.get(i));
                }
            }
            counts[label]++;
        }

        // Turn each sum into a mean; an empty cluster has nothing to divide.
        for (int n = 0; n < k; n++) {
            ArrayList<Double> sum = center.get(n);
            for (int i = 0; i < sum.size(); i++) {
                sum.set(i, sum.get(i) / counts[n]);
            }
        }
        return center;
    }
}
Class that computes the new cluster label of every point
import java.util.ArrayList;
/**
 * Assigns data points to clusters.
 */
public class DistributionData {

    /**
     * Labels every data point with the index of its nearest center, using
     * {@code CalUtil.calDistance} as the distance.
     *
     * <p>Ties go to the center with the lowest index. A center with no coordinates
     * (an empty list) is skipped. If no center qualifies, the point is labelled {@code 0}.
     *
     * @param data   the data points
     * @param center the current cluster centers
     * @return one labelled entry per data point, in the same order as {@code data}
     */
    public static ArrayList<DataWithIndex.dataWithIndex> distributData(ArrayList<ArrayList<Double>> data, ArrayList<ArrayList<Double>> center) {
        ArrayList<DataWithIndex.dataWithIndex> dataindex = new ArrayList<>(data.size());
        for (ArrayList<Double> point : data) {
            // Both values are reset for every point, so a point can never inherit the
            // previous point's label, however far it is from every center.
            int index = 0;
            double min = Double.POSITIVE_INFINITY;
            for (int j = 0; j < center.size(); j++) {
                ArrayList<Double> candidate = center.get(j);
                if (candidate.isEmpty()) {
                    // An empty center has no position to measure against.
                    continue;
                }
                double dis = CalUtil.calDistance(point, candidate);
                if (dis < min) {
                    min = dis;
                    index = j;
                }
            }
            DataWithIndex.dataWithIndex onedataindex = new DataWithIndex.dataWithIndex();
            onedataindex.setData(point);
            onedataindex.setIndex(index);
            dataindex.add(onedataindex);
        }
        return dataindex;
    }
}
Numerical utility class
import java.util.ArrayList;
// Calculation tool class , Two value distance , Select the nearest center point, etc
public class CalUtil {
// Calculate the distance between two vectors , European style
public static double calDistance(ArrayList<Double> element1, ArrayList<Double> element2){
double disSum = 0;
for(int i=0;i<element1.size();i++){
disSum += (element1.get(i) - element2.get(i)) * (element1.get(i) - element2.get(i));
}
return Math.sqrt(disSum);
}
// Add the elements
public static ArrayList<Double> addElement(ArrayList<Double> element1, ArrayList<Double> element2){
for(int i=0;i<element1.size();i++) {
element1.set(i, element1.get(i) + element2.get(i));
}
return element1;
}
// Mainly to make use of CalUtil.calDistanceBetweenCenters Calculate the distance difference between the old and new center points , Because it is difficult to control the threshold information , Directly wait until the two groups of central points are exactly the same to realize shutdown , return true.
// Calculate whether the center of the two iterations has changed , Return distance
public static double calDistanceBetweenCenters(ArrayList<ArrayList<Double>>oldCenter, ArrayList<ArrayList<Double>>newCenter){
// because data The reading order of is the same , Therefore, the order of clustering centers is the same when finally converging
// Just traverse and calculate the distance , Do not consider the order of the center point itself
if(oldCenter.size() > newCenter.size())
return 1000;
double sum = 0;
for(int i=0;i<oldCenter.size();i++){
double singleDistance = calDistance(oldCenter.get(i), newCenter.get(i));
sum += singleDistance;
}
return sum;
}
}
File reading class (pasted here as-is; it reads from HDFS, line by line)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import java.io.IOException;
import java.util.ArrayList;
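// Utilities for reading the center-point files stored on HDFS
// (the original note also mentions deleting them; only reading is implemented here).
// The center-point file is placed under the output directory.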
public class DataUtil {
public static final String HDFS_INPUT = "hdfs://hadoop102:8020/input"; // input Address
public static final String HDFS_OUTPUT = "hdfs://hadoop102:8020/output"; // output Address
// Split a tab-separated line into a list of doubles
/**
 * Parses one tab-separated line of numbers.
 *
 * @param line the text to parse; fields are separated by a tab character
 * @return the parsed values, in field order
 * @throws NumberFormatException if any field is not a valid double (an empty line included)
 */
public static ArrayList<Double> splitStringIntoArray(String line) {
    String[] fields = line.split("\t");
    ArrayList<Double> values = new ArrayList<Double>(fields.length);
    for (int i = 0; i < fields.length; i++) {
        values.add(Double.valueOf(fields[i]));
    }
    return values;
}
// Open a LineReader for the given file on its (HDFS) file system
/**
 * Opens the file at {@code filePath} on the file system that owns the path and wraps
 * the stream in a {@code LineReader}. The caller is responsible for closing the reader.
 *
 * @param filePath path of the file to open
 * @return a line reader over the opened file
 * @throws IOException if the file system cannot be obtained or the file cannot be opened
 */
private static LineReader getLineReader(String filePath) throws IOException {
    Configuration conf = new Configuration();
    Path target = new Path(filePath);
    FSDataInputStream stream = target.getFileSystem(conf).open(target);
    return new LineReader(stream, conf);
}
// Read in center
/**
 * Reads every line from {@code lineReader}, parses it as a tab-separated vector and
 * appends it to {@code centers}. The reader is closed before returning, even if a
 * line fails to parse.
 *
 * <p>Lines that are empty after trimming are skipped. Without this, a blank line
 * (such as a trailing newline at the end of the file) would reach the number parser
 * as an empty string and abort the whole read.
 *
 * @param lineReader the reader to consume; closed by this method
 * @param centers    the list that receives one vector per non-blank line
 * @throws IOException if reading fails
 */
private static void readCenterLines(LineReader lineReader, ArrayList<ArrayList<Double>> centers) throws IOException {
    Text line = new Text();
    try {
        // readLine returns 0 only at end of input; a blank line still consumes its newline.
        while (lineReader.readLine(line) > 0) {
            String content = line.toString().trim();
            if (content.isEmpty()) {
                continue;
            }
            centers.add(splitStringIntoArray(content));
        }
    } finally {
        lineReader.close();
    }
}
// Read the center points.
// The path may be a folder, in which case every file in it is read in turn.
/**
 * Reads all vectors stored at {@code centerPath}.
 *
 * <p>If the path is a directory, every entry returned by listing it is read in
 * listing order; otherwise the path itself is read as a single file.
 *
 * @param centerPath a file or directory path
 * @return the vectors read, one per line
 * @throws IOException if the file system cannot be reached or a file cannot be read
 */
public static ArrayList<ArrayList<Double>> readCenter(String centerPath) throws IOException {
    ArrayList<ArrayList<Double>> centers = new ArrayList<ArrayList<Double>>();
    Path root = new Path(centerPath);
    FileSystem fileSystem = root.getFileSystem(new Configuration());

    // Ordinary file: read it directly.
    if (!fileSystem.isDirectory(root)) {
        readCenterLines(getLineReader(centerPath), centers);
        return centers;
    }

    // Folder: read each listed entry.
    for (FileStatus entry : fileSystem.listStatus(root)) {
        readCenterLines(getLineReader(entry.getPath().toString()), centers);
    }
    return centers;
}
}
File writing class
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
/**
 * Writes a labelled data set to a text file on HDFS.
 */
public class writeOut {

    /**
     * Writes one line per data point: the cluster label as decimal text, a tab, then
     * every coordinate followed by a tab, then a newline.
     *
     * <p>The label is written as its full decimal text. Writing it with
     * {@code writeInt} would put a four-byte binary integer in the file, and taking
     * only the first character of the label would make labels of 10 and above wrong.
     *
     * @param outPath   path of the file to create
     * @param dataindex the labelled data points to write
     * @throws URISyntaxException   if the file system URI is malformed
     * @throws IOException          if the file cannot be created or written
     * @throws InterruptedException if obtaining the file system is interrupted
     */
    public static void writeout(String outPath, ArrayList<DataWithIndex.dataWithIndex> dataindex) throws URISyntaxException, IOException, InterruptedException {
        // 1. Create the configuration (org.apache.hadoop.conf.Configuration)
        Configuration conf = new Configuration();
        // 2. Get the file system and 3. create the file. Both are closed on exit,
        //    the stream first, even when a write fails.
        try (FileSystem fs = FileSystem.get(new URI("hdfs://hadoop102:8020"), conf, "hong");
             FSDataOutputStream out = fs.create(new Path(outPath))) {
            for (DataWithIndex.dataWithIndex point : dataindex) {
                StringBuilder row = new StringBuilder();
                // Cluster label
                row.append(point.getIndex()).append('\t');
                // Coordinates; each one keeps its trailing tab, as before
                for (Double value : point.getData()) {
                    row.append(value.toString()).append('\t');
                }
                row.append('\n');
                out.write(row.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
            }
        }
    }
}
边栏推荐
- Paper reading endmember guided unmixing network (EGU net)
- Qt新建工程简介
- 顺序栈 C语言 进栈 出栈 遍历
- QT char to qstring hexadecimal and char to hexadecimal integer
- HoloLens2开发:使用MRTK并且模拟眼动追踪
- Machine learning (zhouzhihua) Chapter 5 notes on neural network learning
- The detailed process of connecting MySQL with QT and outputting data to QT window tablewidget.
- [activiti] process variables
- Machine learning (zhouzhihua) Chapter 2 model selection and evaluation notes learning experience
- 树莓派大用处,利用校园网搭建一个校园局域网站
猜你喜欢

Statistical analysis of catering data --- Teddy cloud course homework

JVM系统学习

Machine learning (Zhou Zhihua) Chapter 3 Notes on learning linear models

On the concepts of "input channel" and "output channel" in convolutional neural networks

How to download videos on the web

HoloLens 2 开发:开发环境部署

HoloLens 2 开发101:创建首个HoloLens 2应用程序

谷歌/火狐浏览器管理后台新增账号时用户名密码自动填入的问题

MySql下载,及安装环境设置

Common features of ES6
随机推荐
The detailed process of connecting MySQL with QT and outputting data to QT window tablewidget.
Bat batch script, running multiple files at the same time, batch commands executed in sequence, and xshell script.
Channel attention and spatial attention module
Installation of tensorflow and pytorch frames and CUDA pit records
How to solve the problem of large distribution gap between training set and test set
day2-WebSocket+排序
String methods and instances
JDBC elementary learning ------ (learning from Shang Silicon Valley)
Opencv reads avi video and reports an error: number < Max_ number in function ‘icvExtractPattern
HoloLens 2 开发101:创建首个HoloLens 2应用程序
The problem that the user name and password are automatically filled in when Google / Firefox manages the background new account
Qt char型转QString型 16进制与char型 转 16进制整型
Machine learning (zhouzhihua) Chapter 5 notes on neural network learning
JUC并发编程基础(9)--线程池
Qt新手入门级 计算器加、减、乘、除、应用
Answers and analysis of some after-school exercises in signals and systems (Wujing)
[activiti] process variables
Paper reading endmember guided unmixing network (EGU net)
树莓派大用处,利用校园网搭建一个校园局域网站
HoloLens2开发:使用MRTK并且模拟眼动追踪