Tuesday, November 16, 2010

How to get distinct values/lines (dedupe) for a file using the Hadoop MapReduce framework

The following program takes in a text file with a single column and writes the distinct lines of the file, each followed by the number of times it occurs, to the output directory.

1. Create file CalculateDistinct.java and paste the following code

package org.myorg;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class CalculateDistinct {
public static class Map extends MapReduceBase implements Mapper {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text("");
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
word.set(value.toString());
output.collect(word,one);
}
}
public static class Reduce extends MapReduceBase implements Reducer {
public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += 1;
values.next();
}
output.collect(key, new IntWritable(sum));
}
}

public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(CalculateDistinct.class);
conf.setJobName("Calculate Distinct");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}

2. Compile, create Jar and Run

mkdir -p CalculateDistinct
javac -classpath hadoop-0.20.1-dev-core.jar -d CalculateDistinct/ CalculateDistinct.java
jar -cvf CalculateDistinct.jar -C CalculateDistinct/ .
hadoop jar CalculateDistinct.jar org.myorg.CalculateDistinct in/abc.txt out

No comments:

Post a Comment