Hadoop Tutorial

Getting started with Hadoop

This article is a first look at Hadoop: it walks through setting up a single-node installation and running a simple MapReduce word-count job.

Set environment variables

vim ~/.profile
export HADOOP_HOME=/home/mmc/hadoop
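
HADOOP_HOME should point at the unpacked Hadoop distribution. Optionally, the bin and sbin directories can be added to PATH so the commands below can be run without the ./bin and ./sbin prefixes (this is not required by the rest of this guide):

export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH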

Hadoop configuration

vim etc/hadoop/hadoop-env.sh

export JAVA_HOME=/opt/java/jdk1.8.0_151

vim etc/hadoop/core-site.xml

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://0.0.0.0:9000</value>
        <description>HDFS access address</description>
    </property>
    <property>
      <name>dfs.permissions</name>
      <value>false</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>file:/home/mmc/hadoop/tmp</value>
        <description>where Hadoop stores its data</description>
    </property>
</configuration>
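
As a side note, in current Hadoop releases the permissions switch is officially named dfs.permissions.enabled and usually lives in hdfs-site.xml (the old dfs.permissions key is kept as a deprecated alias). An equivalent setting would be:

    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>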

vim etc/hadoop/hdfs-site.xml

<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/home/mmc/hadoop/hdfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/home/mmc/hadoop/hdfs/data</value>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>

Passwordless SSH to localhost

ssh-keygen
cd ~/.ssh
touch authorized_keys
chmod 600 authorized_keys
cat id_rsa.pub >> authorized_keys
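
Before starting the daemons, it is worth confirming that passwordless login works; the following should open a shell without asking for a password:

ssh localhost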

Format the NameNode

./bin/hdfs namenode -format

Start the daemons with debug logs printed to the console

export HADOOP_ROOT_LOGGER=DEBUG,console
./sbin/start-all.sh
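
Once the scripts return, jps is a quick way to confirm that the daemons came up; on a single-node setup you would normally expect to see NameNode, DataNode, SecondaryNameNode, ResourceManager and NodeManager listed.

jps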

Verify

./bin/hadoop fs -ls /
./bin/hadoop fs -mkdir -p /user/hadoop/input
./bin/hadoop fs -ls /user/hadoop/input
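
The word-count job later in this guide reads its input from /user/hadoop/input, so upload at least one text file into that directory first (sample.txt below is just a placeholder for any local file):

./bin/hadoop fs -put sample.txt /user/hadoop/input/
./bin/hadoop fs -ls /user/hadoop/input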

pom.xml

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>3.3.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>3.3.0</version>
      <scope>test</scope>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>3.3.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-common</artifactId>
      <version>3.3.0</version>
    </dependency>


    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-api -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-api</artifactId>
      <version>3.3.0</version>
    </dependency>
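
With these dependencies in place, a minimal connectivity check against HDFS might look like the sketch below. The class name HdfsCheck, the hard-coded hdfs://192.168.0.24:9000 address and the /user/hadoop/input path are assumptions taken from the rest of this guide; adjust them to your own setup.

package org.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.net.URI;

public class HdfsCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Connect to the NameNode; the address is assumed to match core-site.xml above.
        FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.0.24:9000"), conf);
        // List the input directory created in the "Verify" step.
        for (FileStatus status : fs.listStatus(new Path("/user/hadoop/input"))) {
            System.out.println(status.getPath() + "  " + status.getLen() + " bytes");
        }
        fs.close();
    }
}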

Log configuration

src/main/resources/log4j.properties

# Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=DEBUG, A1

# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender

# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

Java code

WordCount.java

package org.example;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCount {
    static class WordCountMapper extends Mapper<LongWritable,Text,Text,IntWritable>{

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // take one line of input and convert the serialized data into a String
            String line = value.toString();
            // split the line on the tab delimiter
            String[] words = line.split("\t");
            // for every word, emit the pair <word, 1>
            for(String word:words){
                // the output key/value must be written as serializable Writable types
                context.write(new Text(word),new IntWritable(1));
            }
        }
    }
    static class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
        // reduce is called once per group: one key together with all of its values (k: v1, v2, v3)
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // counter for this key
            int count = 0;
            // iterate over the group and add up how many times the key occurred
            for(IntWritable value : values){
                count += value.get();
            }
            context.write(key,new IntWritable(count));

        }
    }
}
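
Because the per-word counts are simply summed, the reducer can optionally double as a combiner: adding job.setCombinerClass(WordCount.WordCountReducer.class) in the driver below would shrink the map output before it is shuffled. This is optional; the driver works without it.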

App.java

package org.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class App
{
    public static void main( String[] args ) throws Exception{
        String jobName = "word count";
//        String inputPath = "hdfs://192.168.56.200:9000/user/hadoop/input/";
        String inputPath = "hdfs://192.168.0.24:9000/user/hadoop/input/";
//        String inputPath = "/user/hadoop/input/";

//        String outputPath = "hdfs://192.168.56.200:9000/user/hadoop/output/";
        String outputPath = "/home/mmc/downloads/hadoop/output";

        Configuration conf = new Configuration();

//        conf.set("fs.defaultFS", "hdfs://192.168.56.200:9000");
        conf.set("fs.hdfs.impl","org.apache.hadoop.hdfs.DistributedFileSystem");
        conf.set("dfs.client.block.write.replace-datanode-on-failure.policy", "NEVER");
        conf.set("dfs.client.use.datanode.hostname", "true");

        Job job = Job.getInstance(conf);
        job.setJobName(jobName);

        job.setJarByClass(WordCount.class);

        job.setMapperClass(WordCount.WordCountMapper.class);
        job.setReducerClass(WordCount.WordCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job,new Path(inputPath));
        FileOutputFormat.setOutputPath(job,new Path(outputPath));
        System.exit(job.waitForCompletion(true)?0:1);
    }
}
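
Once the job completes, the result is typically a single part-r-00000 file (plus a _SUCCESS marker) in the output directory. Because outputPath above is a plain local path, it can be inspected directly; had it been an hdfs:// path, ./bin/hadoop fs -cat would be the equivalent. Note that MapReduce refuses to start if the output directory already exists, so remove it between runs.

cat /home/mmc/downloads/hadoop/output/part-r-00000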