MapReduce elementary programming practice
2022-06-28 01:31:00 【wr456wr】
Experimental environment
- Ubuntu 18.04 virtual machine and a Windows 10 physical host
- IDE: IntelliJ IDEA
- Virtual machine IP: 192.168.1.108
- JDK: 1.8
Experimental content
Write a WordCount program in Java, package it as a JAR, and run it in the virtual machine.
First create a Maven project in IDEA.
Add the dependencies and the JAR packaging plugin to pom.xml:
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>3.3.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.3.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-core</artifactId>
        <version>1.2.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase</artifactId>
        <version>2.4.11</version>
        <type>pom</type>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>2.4.11</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-common</artifactId>
        <version>2.4.11</version>
    </dependency>
</dependencies>
<build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.MyProgramDriver</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
        </plugins>
    </pluginManagement>
</build>
Next, write the corresponding programs.
The MyProgramDriver class is the entry point that dispatches to the registered programs:
package com;

import org.apache.hadoop.util.ProgramDriver;

public class MyProgramDriver {
    public static void main(String[] args) {
        int exitCode = -1;
        ProgramDriver programDriver = new ProgramDriver();
        try {
            // Register the WordCount program under the name "com.WordCount"
            programDriver.addClass("com.WordCount", WordCount.class, "com.WordCount Program");
            exitCode = programDriver.run(args);
        } catch (Throwable e) {
            throw new RuntimeException(e);
        }
        System.exit(exitCode);
    }
}
The WordCount program:
package com;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.StringTokenizer;

public class WordCount {

    public WordCount() {
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: wordcount <in> [<in>...] <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // All arguments except the last are input paths; the last one is the output path
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum all counts emitted for the same word
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Split the line into tokens and emit <word, 1> for each token
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }
}
Project structure screenshot:
Click package under Maven in the panel on the right of IDEA to build the project into a JAR file.
The generated JAR is placed in the target directory.
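If you prefer the command line, running Maven's package phase from the project root produces the same JAR in target (a minimal equivalent of the IDEA action):
mvn clean package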
Then send the packaged JAR to the virtual machine. I put it under /root/hadoop/a_dir; any location works, as long as you know where it is.
Then write two input files, input1 and input2; their contents are shown in the screenshot:
Then upload the two files to HDFS. Here I put them under /root/input; note that this is not a path on the local filesystem, but an HDFS path available after Hadoop has been started.
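For example, assuming the two files are in the current local directory on the virtual machine, they could be uploaded with the following commands (run from the Hadoop installation directory, using the paths from this experiment):
bin/hadoop fs -mkdir -p /root/input
bin/hadoop fs -put input1 input2 /root/input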
Then run the program:
bin/hadoop jar a_dir/MyMapReduce-1.0-SNAPSHOT.jar com.WordCount /root/input/* /root/out
Here a_dir/MyMapReduce-1.0-SNAPSHOT.jar is the path of the JAR to run, and com.WordCount is the name of the WordCount program to execute, i.e. the name registered in MyProgramDriver.
/root/input/* selects the input files, and /root/out is the output path.
View the output:
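A quick way to inspect the result from the command line, assuming the default reducer output file name part-r-00000:
bin/hadoop fs -cat /root/out/part-r-00000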
Programming exercise: merge files and remove duplicate entries
Sample input:
20150101 x
20150102 y
20150103 x
20150104 y
20150105 z
20150106 x
20150101 y
20150102 y
20150103 x
20150104 z
20150105 y
The main idea: in map, split each line with a regular expression into a key and a value. For example, the line 20150101 x is split into key 20150101 and value x, both of type Text. The map output goes through the shuffle phase to reduce, where a HashSet, whose elements are unique, is used to deduplicate the values belonging to each key.
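For the sample input above, the merged and deduplicated result should contain the following pairs (keys are sorted by the framework; the order of values under the same key may vary, since HashSet iteration order is not guaranteed):
20150101 x
20150101 y
20150102 y
20150103 x
20150104 y
20150104 z
20150105 y
20150105 z
20150106 x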
The Merge program code:
package com;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.HashSet;

public class Merge {

    public Merge() {
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: merge <in> [<in>...] <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "merge");
        job.setJarByClass(Merge.class);
        job.setMapperClass(Merge.MyMapper.class);
        job.setCombinerClass(Merge.MyReduce.class);
        job.setReducerClass(Merge.MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class MyMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Split the line on whitespace into <date, value>
            String line = value.toString();
            String[] split = line.split("\\s+");
            if (split.length <= 1) {
                return;
            }
            context.write(new Text(split[0]), new Text(split[1]));
        }
    }

    public static class MyReduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Use a HashSet to remove duplicate values for the same key
            HashSet<String> hashSet = new HashSet<>();
            for (Text value : values) {
                hashSet.add(value.toString());
            }
            for (String value : hashSet) {
                context.write(key, new Text(value));
            }
        }
    }
}
Register the Merge program in the MyProgramDriver class:
package com;

import org.apache.hadoop.util.ProgramDriver;

public class MyProgramDriver {
    public static void main(String[] args) {
        int exitCode = -1;
        ProgramDriver programDriver = new ProgramDriver();
        try {
            programDriver.addClass("com.WordCount", WordCount.class, "com.WordCount Program");
            // Register the Merge program under the name "Merge"
            programDriver.addClass("Merge", Merge.class, "xll");
            exitCode = programDriver.run(args);
        } catch (Throwable e) {
            throw new RuntimeException(e);
        }
        System.exit(exitCode);
    }
}
Package the program, send it to the virtual machine, and run it:
bin/hadoop jar a_dir/MyMapReduce-1.0-SNAPSHOT.jar Merge /root/input/* /root/out
Running result:
Programming exercise: sort the input files
Idea: on the Map side, parse each input value and emit <number, 1> key-value pairs. Since sorting by key is the default behavior of MapReduce, the Reduce side only needs to output the keys it receives: the Map-side key becomes the Reduce-side value, and a running rank is emitted as the Reduce-side key.
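As a small illustration with hypothetical numbers (the actual input is shown in the screenshot further below): if the input values are 3, 1 and 3, map emits <3,1>, <1,1> and <3,1>; after the shuffle the reducer receives <1,[1]> and <3,[1,1]> in ascending key order and writes:
1 1
2 3
3 3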
The MyConf class:
Here I extracted the common job configuration into a helper class to reduce code duplication in the later exercises.
package com.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;

public class MyConf {
    public static void setConf(Class mainClass, Class outKeyClass, Class outValueClass, String[] args) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("otherArgs length error, length < 2");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, mainClass.getName());
        // Find the inner MyMapper/MyReduce classes of the main class by simple name
        Class[] innerClass = mainClass.getClasses();
        for (Class c : innerClass) {
            if (c.getSimpleName().equals("MyReduce")) {
                job.setReducerClass(c);
                // job.setCombinerClass(c);
            } else if (c.getSimpleName().equals("MyMapper")) {
                job.setMapperClass(c);
            }
        }
        job.setJarByClass(mainClass);
        job.setOutputKeyClass(outKeyClass);
        job.setOutputValueClass(outValueClass);
        // All arguments except the last are input paths; the last one is the output path
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
The Sort class:
package com;

import com.utils.MyConf;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class Sort {

    public Sort() {
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        MyConf.setConf(Sort.class, IntWritable.class, IntWritable.class, args);
    }

    public static class MyMapper extends Mapper<Object, Text, IntWritable, IntWritable> {
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Emit <number, 1>; the framework sorts by key during the shuffle
            String var = value.toString();
            context.write(new IntWritable(Integer.parseInt(var.trim())), new IntWritable(1));
        }
    }

    public static class MyReduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        // Running rank of the numbers in ascending order
        static int sort = 1;

        @Override
        protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Keys arrive in sorted order; emit <rank, number> once per occurrence
            for (IntWritable va : values) {
                context.write(new IntWritable(sort), key);
                sort++;
            }
        }
    }
}
Then register the Sort class in MyProgramDriver in the same way.
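The registration is a single line following the same pattern as before (the description string is arbitrary):
programDriver.addClass("Sort", Sort.class, "sort input numbers");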
Program input:
After packaging, copy the JAR to the virtual machine and run:
bin/hadoop jar a_dir/MyMapReduce-1.0-SNAPSHOT.jar Sort /root/input* /root/out5
Running result:
Programming exercise: mine information from a given table
Idea (adapted from the references): take, for example, the input
steven lucy
lucy mary
where each line is a <child, parent> pair. After map (see the map logic in the code below) this produces:
<steven,old#lucy>, <lucy,young#steven>, <lucy,old#mary>, <mary,young#lucy>
After shuffle, the reduce input becomes:
<steven,old#lucy>, <lucy,<young#steven,old#mary>>, <mary,young#lucy>
Each key-value pair is then processed on the Reduce side; the pair <lucy,<young#steven,old#mary>> yields the valid output:
<steven, mary>
The InfoFind class:
package com;

import com.utils.MyConf;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;

public class InfoFind {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        MyConf.setConf(InfoFind.class, Text.class, Text.class, args);
    }

    public static class MyMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] splStr = value.toString().split("\\s+");
            String child = splStr[0];
            String parent = splStr[1];
            // Skip the header line "child parent"
            if (child.equals("child") && parent.equals("parent")) {
                return;
            }
            // Emit the relation in both directions: the child sees the parent as the older
            // generation ("old#"), the parent sees the child as the younger generation ("young#")
            context.write(new Text(child), new Text("old#" + parent));
            context.write(new Text(parent), new Text("young#" + child));
        }
    }

    public static class MyReduce extends Reducer<Text, Text, Text, Text> {
        private static boolean head = true;

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Write the output header only once
            if (head) {
                context.write(new Text("grandchild"), new Text("grandparent"));
                head = false;
            }
            // Collect the grandchildren and grandparents connected through this key (the middle generation)
            ArrayList<String> grandchild = new ArrayList<>();
            ArrayList<String> grandparent = new ArrayList<>();
            for (Text val : values) {
                String[] temp = val.toString().split("#");
                if (temp[0].equals("young")) {
                    grandchild.add(temp[1]);
                } else {
                    grandparent.add(temp[1]);
                }
            }
            // Pair every grandchild with every grandparent
            for (String gc : grandchild) {
                for (String gp : grandparent) {
                    context.write(new Text(gc), new Text(gp));
                }
            }
        }
    }
}
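InfoFind also has to be registered in MyProgramDriver before it can be run by name; a registration line of the same form as the earlier ones would be, for example:
programDriver.addClass("InfoFind", InfoFind.class, "find grandchild and grandparent pairs");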
Input:
Run:
bin/hadoop jar a_dir/MyMapReduce-1.0-SNAPSHOT.jar InfoFind /root/input/* /root/out6
Output:
Reference material
https://blog.csdn.net/u013384984/article/details/80229459 (key reference)
https://blog.csdn.net/qq_43310845/article/details/123298811
https://blog.csdn.net/zhangwenbingbingyu/article/details/52210348
https://www.cnblogs.com/ginkgo-/p/13273671.html