The following program takes a text file with a single column as input and writes each distinct line of the file, together with the number of times it occurs, to the output directory.
1. Create file CalculateDistinct.java and paste the following code
2. Compile, create Jar and Run
1. Create file CalculateDistinct.java and paste the following code
package org.myorg;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class CalculateDistinct {
public static class Map extends MapReduceBase implements Mapper {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text("");
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
word.set(value.toString());
output.collect(word,one);
}
}
public static class Reduce extends MapReduceBase implements Reducer {
public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += 1;
values.next();
}
output.collect(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(CalculateDistinct.class);
conf.setJobName("Calculate Distinct");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
2. Compile, create Jar and Run
javac -classpath hadoop-0.20.1-dev-core.jar -d CalculateDistinct/ CalculateDistinct.java
jar -cvf CalculateDistinct.jar -C CalculateDistinct/ .
hadoop jar CalculateDistinct.jar org.myorg.CalculateDistinct in/abc.txt out
No comments:
Post a Comment