Prepare the Dataset and Code
In this step, we'll set up the files and code needed to simulate an ancient text processing system.
First, switch to the hadoop user; using su with the - option also places you in the hadoop user's home directory:
su - hadoop
Create a new directory called distributed-cache-lab
and navigate to it:
mkdir distributed-cache-lab
cd distributed-cache-lab
Next, create a text file named ancient-texts.txt
with the following content:
The wisdom of the ages is eternal.
Knowledge is the path to enlightenment.
Embrace the mysteries of the universe.
This file will represent the ancient texts we want to process.
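You can create the file with any text editor; one way is a shell here-document that writes the three lines above directly into ancient-texts.txt:
cat > ancient-texts.txt << 'EOF'
The wisdom of the ages is eternal.
Knowledge is the path to enlightenment.
Embrace the mysteries of the universe.
EOF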
Now, create a Java file named AncientTextAnalyzer.java
with the following code:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class AncientTextAnalyzer {

    // Mapper: splits each input line into words and emits a (word, 1) pair per token.
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    // Reducer (also used as the combiner): sums the counts emitted for each word.
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // GenericOptionsParser handles the standard Hadoop options (such as -files),
        // leaving only the program's own arguments: <in> and <out>.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: AncientTextAnalyzer <in> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "Ancient Text Analyzer");
        job.setJarByClass(AncientTextAnalyzer.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
This code is a simple MapReduce program that counts the occurrences of each word in the input file. We'll use it to demonstrate how to work with Hadoop's distributed cache.
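As a rough sketch of where this is heading, the class can be compiled against the Hadoop classpath, packaged into a jar, and submitted with the -files generic option, which ships a local file to every task through the distributed cache (the jar name and the input/output paths below are illustrative only, and assume Hadoop is installed with the hadoop command on the PATH):
javac -classpath "$(hadoop classpath)" AncientTextAnalyzer.java
jar cf ancient-text-analyzer.jar AncientTextAnalyzer*.class
# -files works here because main() parses arguments with GenericOptionsParser
hadoop jar ancient-text-analyzer.jar AncientTextAnalyzer -files ancient-texts.txt /input /output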