Mapreduce WordCount:如何高效统计长尾词频次?

2026-04-16 16:22 · 5阅读 · 0评论 · SEO基础
  • 内容介绍
  • 文章标签
  • 相关推荐

本文共计517个文字,预计阅读时间需要3分钟。

Mapreduce WordCount:如何高效统计长尾词频次?

Hadoop MapReduce 实现WordCount,代码片段分为三个源文件:

1.WordCountMapper.java

package com.elon.bigdata.hadoop.mr.wordcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * WordCount mapper: tokenizes each input line on whitespace and emits one
 * {@code <word, 1>} pair per non-empty token.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // BUG FIX: the original extended raw Mapper and declared
        // map(Object key, ...), which does not override the framework's
        // erased map(Object, Object, Context) — the mapper would never run.
        // Generic type parameters make @Override valid and type-safe.
        String[] words = StringUtils.split(value.toString());
        for (String word : words) {
            if (word != null && !word.isEmpty()) {
                // BUG FIX: the original wrote `new Text(1)` — Text has no
                // int constructor; the count must be a LongWritable so it
                // matches the reducer's input value type.
                context.write(new Text(word), new LongWritable(1));
            }
        }
    }
}

Mapreduce WordCount:如何高效统计长尾词频次?

Hadoop-MapReduce之WordCount的实现

本代码片段分为三个源文件,第一个是 WordCountMapper.java:

package com.elon.bigdata.hadoop.mr.wordcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * WordCount mapper: splits each input line on single spaces and emits one
 * {@code <word, 1>} pair per token.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // BUG FIX: the original extended raw Mapper, so with type erasure the
    // @Override below would fail to compile (map(LongWritable, Text, Context)
    // does not override the raw map(Object, Object, Context)).
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the content of one line of the input file.
        String line = value.toString();
        // Split the line into a word array on single spaces.
        String[] words = StringUtils.split(line, " ");
        // Emit <word, 1> for each token.
        for (String word : words) {
            context.write(new Text(word), new LongWritable(1));
        }
    }
}

package com.elon.bigdata.hadoop.mr.wordcount;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * WordCount reducer: sums the per-word counts emitted by the mapper and
 * writes one {@code <word, total>} pair per distinct word.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    // key: "hello", values: {1, 1, 1, 1, ...}
    // BUG FIX: the original extended raw Reducer and declared
    // `Iterable values` — iterating that as LongWritable does not compile.
    // Proper generic parameters fix both the @Override and the loop.
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // Accumulate the count for this word.
        long count = 0;
        for (LongWritable value : values) {
            count += value.get();
        }
        // Output the <word, count> pair.
        context.write(key, new LongWritable(count));
    }
}

package com.elon.bigdata.hadoop.mr.wordcount; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class WordCountRunner { public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job wcjob = Job.getInstance(); //设置job所使用的jar包 conf.set("mapreduce.job.jar", "wcount.jar"); //设置wcjob中的资源所在的jar包 wcjob.setJarByClass(WordCountRunner.class); //wcjob要使用哪个mapper类 wcjob.setMapperClass(WordCountMapper.class); //wcjob要使用哪个reducer类 wcjob.setReducerClass(WordCountReducer.class); //wcjob的mapper类输出的kv数据类型 wcjob.setMapOutputKeyClass(Text.class); wcjob.setMapOutputValueClass(LongWritable.class); //wcjob的reducer类输出的kv数据类型 wcjob.setOutputKeyClass(Text.class); wcjob.setOutputValueClass(LongWritable.class); //指定要处理的原始数据所存放的路径 FileInputFormat.setInputPaths(wcjob, "hdfs://hadoop:9000/wc/srcdata"); //指定处理之后的结果输出到哪个路径 FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://hadoop:9000/wc/output")); boolean res = wcjob.waitForCompletion(true); System.exit(res?0:1); } }

本文共计517个文字,预计阅读时间需要3分钟。

Mapreduce WordCount:如何高效统计长尾词频次?

Hadoop MapReduce 实现WordCount,代码片段分为三个源文件:

1.WordCountMapper.java

package com.elon.bigdata.hadoop.mr.wordcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * WordCount mapper: tokenizes each input line on whitespace and emits one
 * {@code <word, 1>} pair per non-empty token.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // BUG FIX: the original extended raw Mapper and declared
        // map(Object key, ...), which does not override the framework's
        // erased map(Object, Object, Context) — the mapper would never run.
        // Generic type parameters make @Override valid and type-safe.
        String[] words = StringUtils.split(value.toString());
        for (String word : words) {
            if (word != null && !word.isEmpty()) {
                // BUG FIX: the original wrote `new Text(1)` — Text has no
                // int constructor; the count must be a LongWritable so it
                // matches the reducer's input value type.
                context.write(new Text(word), new LongWritable(1));
            }
        }
    }
}

Mapreduce WordCount:如何高效统计长尾词频次?

Hadoop-MapReduce之WordCount的实现

本代码片段分为三个源文件,第一个是 WordCountMapper.java:

package com.elon.bigdata.hadoop.mr.wordcount;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * WordCount mapper: splits each input line on single spaces and emits one
 * {@code <word, 1>} pair per token.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // BUG FIX: the original extended raw Mapper, so with type erasure the
    // @Override below would fail to compile (map(LongWritable, Text, Context)
    // does not override the raw map(Object, Object, Context)).
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the content of one line of the input file.
        String line = value.toString();
        // Split the line into a word array on single spaces.
        String[] words = StringUtils.split(line, " ");
        // Emit <word, 1> for each token.
        for (String word : words) {
            context.write(new Text(word), new LongWritable(1));
        }
    }
}

package com.elon.bigdata.hadoop.mr.wordcount;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * WordCount reducer: sums the per-word counts emitted by the mapper and
 * writes one {@code <word, total>} pair per distinct word.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    // key: "hello", values: {1, 1, 1, 1, ...}
    // BUG FIX: the original extended raw Reducer and declared
    // `Iterable values` — iterating that as LongWritable does not compile.
    // Proper generic parameters fix both the @Override and the loop.
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // Accumulate the count for this word.
        long count = 0;
        for (LongWritable value : values) {
            count += value.get();
        }
        // Output the <word, count> pair.
        context.write(key, new LongWritable(count));
    }
}

package com.elon.bigdata.hadoop.mr.wordcount; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class WordCountRunner { public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job wcjob = Job.getInstance(); //设置job所使用的jar包 conf.set("mapreduce.job.jar", "wcount.jar"); //设置wcjob中的资源所在的jar包 wcjob.setJarByClass(WordCountRunner.class); //wcjob要使用哪个mapper类 wcjob.setMapperClass(WordCountMapper.class); //wcjob要使用哪个reducer类 wcjob.setReducerClass(WordCountReducer.class); //wcjob的mapper类输出的kv数据类型 wcjob.setMapOutputKeyClass(Text.class); wcjob.setMapOutputValueClass(LongWritable.class); //wcjob的reducer类输出的kv数据类型 wcjob.setOutputKeyClass(Text.class); wcjob.setOutputValueClass(LongWritable.class); //指定要处理的原始数据所存放的路径 FileInputFormat.setInputPaths(wcjob, "hdfs://hadoop:9000/wc/srcdata"); //指定处理之后的结果输出到哪个路径 FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://hadoop:9000/wc/output")); boolean res = wcjob.waitForCompletion(true); System.exit(res?0:1); } }