# 현재 사용중인 데이터베이스 출력

hive> show databases;
OK
default
test_database
Time taken: 2.173 seconds

hive> set hive.cli.print.current.db=true;
hive (default)> USE test_database;
OK
Time taken: 0.014 seconds

hive (test_database)> set hive.cli.print.current.db=false;
hive>

# Hive 데이터 베이스 생성

- Hive 기본 데이터베이스 default 외에 추가로 데이터베이스 생성
hive> show databases;
OK
default
Time taken: 2.883 seconds

-- testdb 생성
hive> create database testdb;
OK
Time taken: 0.163 seconds

hive> show databases;
OK
default
testdb
Time taken: 0.041 seconds

# Hive 테이블 생성 및 데이터 임포트, 쿼리 테스트

hive> show tables;
OK

hive> CREATE TABLE books(isbn INT, title STRING); 
OK
Time taken: 0.291 seconds

hive> show tables;
OK
books
Time taken: 0.095 seconds

hive> DESCRIBE books;
OK
isbn    int
title   string
Time taken: 0.131 seconds

hive> 
hive> CREATE TABLE users (id INT, name STRING) PARTITIONED BY (vcol STRING);
OK
Time taken: 0.048 seconds

hive> DESC users;
OK
id      int
name    string
vcol    string
Time taken: 0.069 seconds

hive> SHOW TABLES;
OK
books
users
Time taken: 0.032 seconds

hive> ALTER TABLE books ADD COLUMNS (author STRING, category STRING);
OK
Time taken: 0.115 seconds

hive> DESC books;
OK
isbn    int
title   string
author  string
category        string
Time taken: 0.072 seconds

hive> ALTER TABLE books CHANGE author author ARRAY<STRING> COMMENT "multi-valued";
OK
Time taken: 0.1 seconds
hive> DESC books;
OK
isbn    int
title   string
author  array<string>   multi-valued
category        string
Time taken: 0.079 seconds

hive> ALTER TABLE books RENAME TO published_contents;
OK
Time taken: 0.708 seconds
hive> DESCRIBE published_contents;
OK
isbn    int
title   string
author  array<string>   multi-valued
category        string
Time taken: 0.085 seconds

hive> 
hive> DROP TABLE published_contents;
OK
Time taken: 1.993 seconds
hive> DROP TABLE users;             
OK
Time taken: 0.231 seconds
hive> show tables;
OK

# 테스트(영화 관련) 데이터 다운로드

사이트 : http://www.grouplens.org/node/73

[hadoop@master data_files]$ wget http://www.grouplens.org/system/files/ml-1m.zip

[hadoop@master data_files]$ unzip ml-1m.zip
[hadoop@master data_files]$ cd ml-1m
[hadoop@master ml-1m]$ ls -al
합계 24336
drwxr-x---. 2 hadoop hadoop     4096 2011-08-25 04:22 .
drwxrwxr-x. 5 hadoop hadoop     4096 2013-05-29 17:29 ..
-rw-r-----. 1 hadoop hadoop     5189 2011-08-25 04:22 README
-rw-r-----. 1 hadoop hadoop   171308 2003-03-27 06:18 movies.dat
-rw-r-----. 1 hadoop hadoop 24594131 2003-03-01 06:53 ratings.dat
-rw-r-----. 1 hadoop hadoop   134368 2003-03-01 06:53 users.dat
[hadoop@master ml-1m]$ more ratings.dat
------------------------------------
UserID::MovieID::Rating::Timestamp
------------------------------------
1::1193::5::978300760
1::661::3::978302109
1::914::3::978301968
1::3408::4::978300275
1::2355::5::978824291
1::1197::3::978302268
1::1287::5::978302039
1::2804::5::978300719
1::594::4::978302268
1::919::4::978301368
1::595::5::978824268
1::938::4::978301752
1::2398::4::978302281
1::2918::4::978302124

[hadoop@master ml-1m]$ more movies.dat
1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
6::Heat (1995)::Action|Crime|Thriller
7::Sabrina (1995)::Comedy|Romance
8::Tom and Huck (1995)::Adventure|Children's
9::Sudden Death (1995)::Action
10::GoldenEye (1995)::Action|Adventure|Thriller

[hadoop@master ml-1m]$ more users.dat
1::F::1::10::48067
2::M::56::16::70072
3::M::25::15::55117
4::M::45::7::02460
5::M::25::20::55455
6::F::50::9::55117
7::M::35::1::06810
8::M::25::12::11413
9::M::25::17::61614
10::F::35::1::95370

# 데이터 임포트 편의를 위해 구분자 변경(:: 에서 #)
[hadoop@master ml-1m]$ sed s/::/#/g movies.dat > movies.dat.hash_delimited
[hadoop@master ml-1m]$ sed s/::/#/g ratings.dat > ratings.dat.hash_delimited
[hadoop@master ml-1m]$ sed s/::/#/g users.dat > users.dat.hash_delimited
[hadoop@master ml-1m]$ ls
README      movies.dat.hash_delimited  ratings.dat.hash_delimited  users.dat.hash_delimited
movies.dat  ratings.dat                users.dat

# 테이블 생성 및 데이터 임포트

hive> CREATE TABLE ratings(
    >   userid INT,
    >   movieid INT,
    >   rating INT,
    >   tstamp STRING)
    >   ROW FORMAT DELIMITED
    >   FIELDS TERMINATED BY '#'
    >   STORED AS TEXTFILE;
OK
Time taken: 0.095 seconds
hive> show tables;
OK
ratings
Time taken: 0.027 seconds
hive> 
hive> LOAD DATA LOCAL INPATH '/home/hadoop/data_files/ml-1m/ratings.dat.hash_delimited' 
    >   OVERWRITE INTO TABLE ratings;
Copying data from file:/home/hadoop/data_files/ml-1m/ratings.dat.hash_delimited
Copying file: file:/home/hadoop/data_files/ml-1m/ratings.dat.hash_delimited
Loading data to table default.ratings
Deleted hdfs://master.namenode:9000/user/hive/warehouse/ratings
Table default.ratings stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 21593504, raw_data_size: 0]
OK
Time taken: 2.121 seconds
hive> 

hive> SELECT COUNT(*) FROM ratings;                                                     
Total MapReduce jobs = 1
Launching Job 1 out of 1
OK
1000209
Time taken: 14.049 seconds
hive> 

hive> SELECT * FROM ratings LIMIT 5;
OK
1       1193    5       978300760
1       661     3       978302109
1       914     3       978301968
1       3408    4       978300275
1       2355    5       978824291
Time taken: 0.082 seconds

hive> CREATE TABLE movies(
    >   movieid INT,
    >   title STRING,
    >   genres STRING)
    >   ROW FORMAT DELIMITED
    >   FIELDS TERMINATED BY '#'
    >   STORED AS TEXTFILE;
OK
Time taken: 0.141 seconds

hive> LOAD DATA LOCAL INPATH '/home/hadoop/data_files/ml-1m/movies.dat.hash_delimited'
    >   OVERWRITE INTO TABLE movies;

hive> SELECT COUNT(*) FROM movies;
Total MapReduce jobs = 1
Launching Job 1 out of 1
OK
3883
Time taken: 12.02 seconds
hive> 

hive> CREATE TABLE movies_2(
    >   movieid INT,
    >   title STRING,
    >   genres ARRAY<STRING>)
    >   ROW FORMAT DELIMITED
    >   FIELDS TERMINATED BY '#'
    >   COLLECTION ITEMS TERMINATED BY '|'
    >   STORED AS TEXTFILE;
OK
Time taken: 0.051 seconds

hive> SHOW tables;                                                                    
OK
movies
movies_2
ratings
Time taken: 0.039 seconds

hive> LOAD DATA LOCAL INPATH '/home/hadoop/data_files/ml-1m/movies.dat.hash_delimited'
    >   OVERWRITE INTO TABLE movies_2;
Copying data from file:/home/hadoop/data_files/ml-1m/movies.dat.hash_delimited
Copying file: file:/home/hadoop/data_files/ml-1m/movies.dat.hash_delimited
Loading data to table default.movies_2
Deleted hdfs://master.namenode:9000/user/hive/warehouse/movies_2
Table default.movies_2 stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 163542, raw_data_size: 0]
OK
Time taken: 0.321 seconds

hive> SELECT COUNT(*) FROM movies_2;
Total MapReduce jobs = 1
Launching Job 1 out of 1
OK
3883
Time taken: 13.086 seconds
hive> 

hive> SELECT * FROM movies limit 5;
OK
1       Toy Story (1995)        Animation|Children's|Comedy
2       Jumanji (1995)  Adventure|Children's|Fantasy
3       Grumpier Old Men (1995) Comedy|Romance
4       Waiting to Exhale (1995)        Comedy|Drama
5       Father of the Bride Part II (1995)      Comedy
Time taken: 0.081 seconds

hive> SELECT * FROM movies limit 10;
OK
1       Toy Story (1995)        Animation|Children's|Comedy
2       Jumanji (1995)  Adventure|Children's|Fantasy
3       Grumpier Old Men (1995) Comedy|Romance
4       Waiting to Exhale (1995)        Comedy|Drama
5       Father of the Bride Part II (1995)      Comedy
6       Heat (1995)     Action|Crime|Thriller
7       Sabrina (1995)  Comedy|Romance
8       Tom and Huck (1995)     Adventure|Children's
9       Sudden Death (1995)     Action
10      GoldenEye (1995)        Action|Adventure|Thriller
Time taken: 0.069 seconds
hive> 

hive> CREATE TABLE users(
    >   userid INT,
    >   gender STRING,
    >   age INT,
    >   occupation INT,
    >   zipcode STRING)
    >   ROW FORMAT DELIMITED
    >   FIELDS TERMINATED BY '#'
    >   STORED AS TEXTFILE;
OK
Time taken: 0.055 seconds

hive> LOAD DATA LOCAL INPATH '/home/hadoop/data_files/ml-1m/users.dat.hash_delimited'
    >   OVERWRITE INTO TABLE users;
OK
Time taken: 0.278 seconds

hive> SELECT COUNT(*) FROM users;
Total MapReduce jobs = 1
Launching Job 1 out of 1
OK
6040
Time taken: 13.066 seconds
hive> 

hive> SELECT * FROM USERS LIMIT 5;
OK
1       F       1       10      48067
2       M       56      16      70072
3       M       25      15      55117
4       M       45      7       02460
5       M       25      20      55455
Time taken: 0.073 seconds
hive> 


hive> add FILE /home/hadoop/data_files/ch12/ch12/occupation_mapper.py;
Added resource: /home/hadoop/data_files/ch12/ch12/occupation_mapper.py
hive> INSERT OVERWRITE TABLE users_2
    >   SELECT
    >   TRANSFORM (userid, gender, age, occupation, zipcode)
    >   USING 'python occupation_mapper.py'
    >   AS (userid, gender, age, occupation_str, zipcode)
    >   FROM users;

occupation_mapper.py
import sys

# MovieLens 1M occupation codes (see the dataset's README) mapped to their
# human-readable labels. Used by the Hive TRANSFORM step to replace the
# numeric `occupation` column with a descriptive string.
OCCUPATION_DICT = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}


def map_occupation_line(line):
    """Rewrite one '#'-delimited users record, replacing the numeric
    occupation code with its label.

    Input  : 'userid#gender#age#occupation#zipcode' (trailing newline OK)
    Returns: the same record with the occupation field mapped via
             OCCUPATION_DICT.
    Raises : ValueError on a malformed record (wrong field count or a
             non-integer occupation code); KeyError on an unknown code.
    """
    userid, gender, age, occupation, zipcode = line.strip().split('#')
    # The field arrives as a string; the dict is keyed by int — convert.
    occupation_str = OCCUPATION_DICT[int(occupation)]
    return '#'.join([userid, gender, age, occupation_str, zipcode])


if __name__ == '__main__':
    # Hive TRANSFORM contract: records arrive on stdin, one per line;
    # the rewritten records are emitted on stdout.
    for line in sys.stdin:
        print(map_occupation_line(line))

# HDFS 데이터 조회

[hadoop@master bin]$ hadoop fs -lsr /user/hive
drwxrwxr-x   - hadoop supergroup          0 2013-05-29 18:01 /user/hive/warehouse
drwxr-xr-x   - hadoop supergroup          0 2013-05-20 21:59 /user/hive/warehouse/hadoop.db
drwxr-xr-x   - hadoop supergroup          0 2013-05-20 21:59 /user/hive/warehouse/hadoop.db/apachelog
drwxr-xr-x   - hadoop supergroup          0 2013-05-29 17:49 /user/hive/warehouse/movies
-rw-r--r--   1 hadoop supergroup     163542 2013-05-29 17:49 /user/hive/warehouse/movies/movies.dat.hash_delimited
drwxr-xr-x   - hadoop supergroup          0 2013-05-29 17:51 /user/hive/warehouse/movies_2
-rw-r--r--   1 hadoop supergroup     163542 2013-05-29 17:51 /user/hive/warehouse/movies_2/movies.dat.hash_delimited
drwxr-xr-x   - hadoop supergroup          0 2013-05-29 17:44 /user/hive/warehouse/ratings
-rw-r--r--   1 hadoop supergroup   21593504 2013-05-29 17:44 /user/hive/warehouse/ratings/ratings.dat.hash_delimited
drwxr-xr-x   - hadoop supergroup          0 2013-05-29 17:54 /user/hive/warehouse/users
-rw-r--r--   1 hadoop supergroup     110208 2013-05-29 17:54 /user/hive/warehouse/users/users.dat.hash_delimited
drwxr-xr-x   - hadoop supergroup          0 2013-05-29 18:01 /user/hive/warehouse/users_2

# 데이터 조회

hive> SELECT title FROM movies
    > WHERE title LIKE '%Party%';
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201305270954_0018, Tracking URL = http://master.namenode:50030/jobdetails.jsp?jobid=job_201305270954_0018
Kill Command = /home/hadoop/hadoop-1.1.2/libexec/../bin/hadoop job  -kill job_201305270954_0018
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2013-05-29 18:27:54,716 Stage-1 map = 0%,  reduce = 0%
2013-05-29 18:27:56,725 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 0.92 sec
2013-05-29 18:27:57,731 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 0.92 sec
MapReduce Total cumulative CPU time: 920 msec
Ended Job = job_201305270954_0018
MapReduce Jobs Launched: 
Job 0: Map: 1   Cumulative CPU: 0.92 sec   HDFS Read: 163776 HDFS Write: 299 SUCCESS
Total MapReduce CPU Time Spent: 920 msec
OK
Party Girl (1995)
House Party 3 (1994)
It's My Party (1995)
Bachelor Party (1984)
House Party (1990)
House Party 2 (1991)
Psycho Beach Party (2000)
Beach Party (1963)
Pajama Party (1964)
Slumber Party Massacre, The (1982)
Slumber Party Massacre II, The (1987)
Slumber Party Massacre III, The (1990)
Time taken: 5.996 seconds
hive> 

hive> SELECT * FROM movies  
    > ORDER BY movieid DESC 
    > LIMIT 5;
3952    Contender, The (2000)   Drama|Thriller
3951    Two Family House (2000) Drama
3950    Tigerland (2000)        Drama
3949    Requiem for a Dream (2000)      Drama
3948    Meet the Parents (2000) Comedy
Time taken: 13.001 seconds
hive> 

hive> SELECT * FROM movies          
    > ORDER BY title DESC           
    > LIMIT 5;
2600    eXistenZ (1999) Action|Sci-Fi|Thriller
2698    Zone 39 (1997)  Sci-Fi
1426    Zeus and Roxanne (1997) Children's
1364    Zero Kelvin (Kjærlighetens kjøtere) (1995)      Action
1845    Zero Effect (1998)      Comedy|Thriller
Time taken: 13.036 seconds
hive> 

hive> SELECT ratings.userid, ratings.rating, ratings.tstamp, movies.title
    > FROM ratings JOIN movies
    > ON(ratings.movieid = movies.movieid)
    > LIMIT 5;
OK
1489    2       974759943       Toy Story (1995)
669     5       975631430       Toy Story (1995)
2398    4       974385474       Toy Story (1995)
1860    5       975258612       Toy Story (1995)
2011    4       974676716       Toy Story (1995)
Time taken: 14.093 seconds

hive> SELECT ratings.userid, ratings.rating, ratings.tstamp, movies.title
    > FROM ratings JOIN movies
    > ON(ratings.movieid = movies.movieid)
    > ORDER BY ratings.userid ASC
    > LIMIT 5;
Total MapReduce jobs = 2
Launching Job 1 out of 2
...
Launching Job 2 out of 2
Total MapReduce CPU Time Spent: 24 seconds 410 msec
OK
1       4       978300055       Titanic (1997)
1       5       978301777       Mary Poppins (1964)
1       5       978301753       Sound of Music, The (1965)
1       5       978824195       Schindler's List (1993)
1       4       978301368       Wizard of Oz, The (1939)
Time taken: 35.86 seconds

hive> SELECT ratings.userid, ratings.rating, ratings.tstamp, movies.title, users.gender
    > FROM ratings JOIN movies ON(ratings.movieid = movies.movieid)
    > JOIN users ON(ratings.userid = users.userid)
    > LIMIT 5;
Total MapReduce jobs = 2
Launching Job 1 out of 2
...
Launching Job 2 out of 2
Total MapReduce CPU Time Spent: 26 seconds 70 msec
OK
1       4       978300055       Titanic (1997)  F
1       5       978301777       Mary Poppins (1964)     F
1       5       978301753       Sound of Music, The (1965)      F
1       5       978824195       Schindler's List (1993) F
1       4       978301368       Wizard of Oz, The (1939)        F
Time taken: 29.855 seconds

# UNION, 인라인뷰 쿼리

hive> SELECT user_id, rating_count
    > FROM(SELECT ratings.userid AS user_id, COUNT(ratings.rating) AS rating_count
    > FROM ratings
    > WHERE ratings.rating = 5
    > GROUP BY ratings.userid
    > ) top_raters
    > WHERE rating_count > 15
    > LIMIT 10;
Total MapReduce jobs = 1
Launching Job 1 out of 1
...
Total MapReduce CPU Time Spent: 4 seconds 400 msec
OK
1       18
2       34
5       21
6       17
8       45
9       17
10      154
11      29
17      69
18      105
Time taken: 14.065 seconds
hive> 

# 실행계획(EXPLAIN) 출력

hive> EXPLAIN SELECT COUNT(*) FROM ratings
    > WHERE movieid = 1 AND rating = 5;  

# 실행계획(EXPLAIN) & 물리적 파일 정보(EXTENDED)

hive> EXPLAIN EXTENDED SELECT COUNT(*) FROM ratings
    > WHERE movieid = 1 AND rating = 5;     


참고 : NoSQL 프로그래밍


※ 위 내용은, 여러 자료를 참고하거나 제가 주관적으로 정리한 것입니다.
   잘못된 정보나 보완이 필요한 부분을, 댓글 또는 메일로 보내주시면 많은 도움이 되겠습니다.
05 29, 2013 20:04 05 29, 2013 20:04


Trackback URL : http://develop.sunshiny.co.kr/trackback/915

Leave a comment

« Previous : 1 : ... 109 : 110 : 111 : 112 : 113 : 114 : 115 : 116 : 117 : ... 648 : Next »

Recent Posts

  1. HDFS - Python Encoding 오류 처리
  2. HP - Vertica ROS Container 관련 오류...
  3. HDFS - Hive 실행시 System Time 오류
  4. HP - Vertica 사용자 쿼리 이력 테이블...
  5. Client에서 HDFS 환경의 데이터 처리시...

Recent Comments

  1. 안녕하세요^^ 배그핵
  2. 안녕하세요^^ 도움이 되셨다니, 저... sunshiny
  3. 정말 큰 도움이 되었습니다.. 감사합... 사랑은
  4. 네, 안녕하세요. 댓글 남겨 주셔서... sunshiny
  5. 감사합니다 많은 도움 되었습니다!ㅎㅎ 프리시퀸스

Recent Trackbacks

  1. amazon fire television amazon fire television %M
  2. how to broadcast your own tv station how to broadcast your own tv station %M
  3. elapsed time clock for operating r... elapsed time clock for operating r... %M
  4. Mysql - mysql 설치후 Character set... 멀고 가까움이 다르기 때문 %M

Calendar

«   12 2019   »
1 2 3 4 5 6 7
8 9 10 11 12 13 14
15 16 17 18 19 20 21
22 23 24 25 26 27 28
29 30 31        

Bookmarks

  1. 위키피디아
  2. MysqlKorea
  3. 오라클 클럽
  4. API - Java
  5. Apache Hadoop API
  6. Apache Software Foundation
  7. HDFS 생태계 솔루션
  8. DNSBL - Spam Database Lookup
  9. Ready System
  10. Solaris Freeware
  11. Linux-Site
  12. 윈디하나의 솔라나라

Site Stats

TOTAL 2780506 HIT
TODAY 89 HIT
YESTERDAY 1360 HIT