
HDFS Commands

Open a terminal window to the current working directory.
# /home/training
# 1. Print the Hadoop version
hadoop version
# 2. List the contents of the root directory in HDFS
#
hadoop fs -ls /
# 3. Report the amount of space used and
# available on the currently mounted filesystem
#
hadoop fs -df hdfs:/
# 4. Count the number of directories, files and bytes under
# the paths that match the specified file pattern
#
hadoop fs -count hdfs:/
# 5. Run a DFS filesystem checking utility
#
hadoop fsck /
# 6. Run a cluster balancing utility
#
hadoop balancer
# 7. Create a new directory named "hadoop" below the
# /user/training directory in HDFS. Since you're
# currently logged in with the "training" user ID,
# /user/training is your home directory in HDFS.
#
hadoop fs -mkdir /user/training/hadoop
# 8. Add a sample text file from the local directory
# named "data" to the new directory you created in HDFS
# during the previous step.
#
hadoop fs -put data/sample.txt /user/training/hadoop
# 9. List the contents of this new directory in HDFS.
#
hadoop fs -ls /user/training/hadoop
# 10. Add the entire local directory called "retail" to the
# /user/training directory in HDFS.
#
hadoop fs -put data/retail /user/training/hadoop
# 11. Since /user/training is your home directory in HDFS,
# any command that does not have an absolute path is
# interpreted as relative to that directory. The next
# command will therefore list your home directory, and
# should show the items you've just added there.
#
hadoop fs -ls
# 12. See how much space this directory occupies in HDFS.
#
hadoop fs -du -s -h hadoop/retail
# 13. Delete a file 'customers' from the "retail" directory.
#
hadoop fs -rm hadoop/retail/customers
# 14. Ensure this file is no longer in HDFS.
#
hadoop fs -ls hadoop/retail/customers
# 15. Delete all files from the "retail" directory using a wildcard.
#
hadoop fs -rm hadoop/retail/*
# 16. To empty the trash
#
hadoop fs -expunge
# 17. Finally, remove the entire retail directory and all
# of its contents in HDFS.
#
hadoop fs -rm -r hadoop/retail
# 18. List the hadoop directory again
#
hadoop fs -ls hadoop
# 19. Add the purchases.txt file from the local directory
# named "/home/training/" to the hadoop directory you created in HDFS
#
hadoop fs -copyFromLocal /home/training/purchases.txt hadoop/
# 20. To view the contents of your text file purchases.txt
# which is present in your hadoop directory
#
hadoop fs -cat hadoop/purchases.txt
# 21. Copy the purchases.txt file from the "hadoop" directory in HDFS
# to the "data" directory in your local file system
#
hadoop fs -copyToLocal hadoop/purchases.txt /home/training/data
# 22. cp is used to copy files between directories present in HDFS
#
hadoop fs -cp /user/training/*.txt /user/training/hadoop
# 23. The '-get' command can be used as an alternative to '-copyToLocal'
#
hadoop fs -get hadoop/sample.txt /home/training/
# 24. Display the last kilobyte of the file "purchases.txt" to stdout
#
hadoop fs -tail hadoop/purchases.txt
# 25. Default file permissions are 666 in HDFS.
# Use the '-chmod' command to change the permissions of a file
#
hadoop fs -ls hadoop/purchases.txt
sudo -u hdfs hadoop fs -chmod 600 hadoop/purchases.txt
# 26. Default names of owner and group are training, training.
# Use '-chown' to change the owner name and group name simultaneously
#
hadoop fs -ls hadoop/purchases.txt
sudo -u hdfs hadoop fs -chown root:root hadoop/purchases.txt
# 27. Default name of the group is training.
# Use the '-chgrp' command to change the group name
#
hadoop fs -ls hadoop/purchases.txt
sudo -u hdfs hadoop fs -chgrp training hadoop/purchases.txt
# 28. Move a directory from one location to another
#
hadoop fs -mv hadoop apache_hadoop
# 29. Default replication factor of a file is 3.
# Use the '-setrep' command to change the replication factor of a file
#
hadoop fs -setrep -w 2 apache_hadoop/sample.txt
# 30. Copy a directory from one node in the cluster to another.
# Use the 'distcp' command to copy,
# the -overwrite option to overwrite existing files and
# the -update option to synchronize both directories
#
hadoop distcp hdfs://namenodeA/apache_hadoop hdfs://namenodeB/hadoop
# 31. Command to make the name node leave safe mode
#
hadoop fs -expunge
sudo -u hdfs hdfs dfsadmin -safemode leave
# 32. List all the hadoop file system shell commands
#
hadoop fs
# 33. Last but not least, always ask for help!
#
hadoop fs -help
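Each of the R practicals later in this manual also begins with an "(a) HDFS Command" part. If needed, the same shell commands can be run from inside an R session with base R's system2(); a small illustration (assuming the hadoop client is on the PATH):

# Print the Hadoop version from R
system2("hadoop", args = "version")
# List the home directory in HDFS from R
system2("hadoop", args = c("fs", "-ls", "/user/training"))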
6. Implement Bloom Filter using Python/R Programming.

def hash(x):
    return (6 * x + 1) % 5

def count_zeros(binary):
    count = 0
    max_count = 0
    encountered_one = False  # Add a flag to track if '1' is encountered
    for bit in reversed(binary):
        if bit == '0' and not encountered_one:
            count += 1
        elif bit == '1':
            encountered_one = True
            count = 0
        max_count = max(count, max_count)
    return max_count

if __name__ == "__main__":
    size = int(input("enter the size of stream: "))
    input_array = []

    for i in range(size):
        element = int(input("element {}: ".format(i + 1)))
        hashed_element = hash(element)
        binary_rep = bin(hashed_element)[2:].zfill(3)
        input_array.append(binary_rep)

        print("Binary representation of element {} is: {}".format(i + 1, binary_rep))

    max_zeros = max(count_zeros(binary) for binary in input_array)
    distinct = 2 ** max_zeros

    print("Maximum consecutive zeros count: ", max_zeros)
    print("Distinct count: ", distinct)
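Note that the listing above actually estimates the number of distinct elements from trailing zeros (the same Flajolet-Martin idea used in practical 7) rather than building a Bloom filter. Since the practical allows Python or R, a minimal Bloom filter sketch in R is given below; the bit-array size, the two hash functions and the sample words are illustrative choices only, not part of the original answer.

# Minimal Bloom filter sketch: a logical bit vector plus two simple hash functions
bloom_size <- 20
bloom_bits <- rep(FALSE, bloom_size)

# Two illustrative hash functions based on the character codes of the key
hash1 <- function(key) {
  codes <- utf8ToInt(key)
  (sum(codes) %% bloom_size) + 1
}
hash2 <- function(key) {
  codes <- utf8ToInt(key)
  (sum(codes * seq_along(codes)) %% bloom_size) + 1
}

# Set the bits for a key
bloom_add <- function(key) {
  bloom_bits[c(hash1(key), hash2(key))] <<- TRUE
}

# TRUE means "possibly present" (false positives are possible),
# FALSE means "definitely not present"
bloom_check <- function(key) {
  all(bloom_bits[c(hash1(key), hash2(key))])
}

# Usage example
for (word in c("bread", "milk", "cola")) bloom_add(word)
bloom_check("milk")       # TRUE - was added
bloom_check("detergent")  # FALSE (or, occasionally, a false positive)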
7. Implement FM algorithm using Python/R Programming.

def hash(x):
    return (6 * x + 1) % 5

def to_three_bit_binary(num):
    binary = bin(num)[2:]
    binary = binary.zfill(3)
    return binary

def count_trailing_zeros(arr):
    result = []
    for binary in arr:
        count = 0
        encountered_one = False
        for digit in binary[::-1]:
            if digit == '0' and not encountered_one:
                count += 1
            elif digit == '1':
                encountered_one = True
        if not encountered_one:
            count = 0
        result.append(count)
    return result

def main():
    size = int(input("Enter the size of the array: "))
    input_array = []

    print("Enter elements of the array:")
    for i in range(size):
        element = int(input(f"Element {i + 1}: "))
        input_array.append(element)

    hashed_array = [hash(x) for x in input_array]

    binary_array = [to_three_bit_binary(x) for x in hashed_array]

    trailing_zeros_array = count_trailing_zeros(binary_array)

    max_trailing_zeros = max(trailing_zeros_array)
    print("Maximum trailing zeros:", max_trailing_zeros)
    power_of_two = 2 ** max_trailing_zeros
    print("The number of distinct elements:", power_of_two)

if __name__ == "__main__":
    main()
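A quick worked trace of the program above: if the stream 1, 2, 3, 4 is entered, the hash (6x + 1) mod 5 maps it to 2, 3, 4, 0; the 3-bit forms are 010, 011, 100, 000; the trailing-zero counts are 1, 0, 2, 0 (an all-zero string is counted as 0 by the code above), so the maximum R is 2 and the estimated number of distinct elements is 2^R = 4.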
Programming Using R

1. (A) HDFS Command
(B) Write an R program to create a data frame which contains details of 5 employees and display a summary of the data using R.
Ans:
newdata <- data.frame(
  "name" = c("harsh", "pankaj", "ishwar", "rahuk", "roy"),
  "salary" = c("600", "388", "688", "688", "455"),
  "year" = c("6", "3", "5", "8", "5")
)
print(summary(newdata))
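Note that salary and year are entered as character vectors above, so in current R (4.0+) summary() reports only their length, class and mode. If numeric summaries (minimum, median, mean) are wanted, a small variant of the same answer stores them as numbers:

# Same data frame, with salary and year stored as numbers so that
# summary() returns min/median/mean for them
newdata <- data.frame(
  "name" = c("harsh", "pankaj", "ishwar", "rahuk", "roy"),
  "salary" = c(600, 388, 688, 688, 455),
  "year" = c(6, 3, 5, 8, 5)
)
print(summary(newdata))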
2. (A) HDFS Command
(B) For any dataset, visualize the following types of chart: Scatterplot, Bubble Chart, Bar Chart, Dot Plot, Histogram, Box Plot, Pie Chart.
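No worked answer appears above for this practical; a minimal sketch using the built-in mtcars dataset (chosen here only as an example, any dataset would do) could be:

# Example charts drawn from the built-in mtcars dataset
data(mtcars)
par(mfrow = c(2, 4))  # arrange the plots in a grid
# Scatterplot
plot(mtcars$wt, mtcars$mpg, main = "Scatterplot",
     xlab = "Weight", ylab = "Miles per Gallon")
# Bubble chart: bubble size proportional to horsepower
symbols(mtcars$wt, mtcars$mpg, circles = mtcars$hp, inches = 0.15,
        main = "Bubble Chart", xlab = "Weight", ylab = "Miles per Gallon")
# Bar chart of cylinder counts
barplot(table(mtcars$cyl), main = "Bar Chart", xlab = "Cylinders", ylab = "Count")
# Dot plot
dotchart(mtcars$mpg, labels = rownames(mtcars), main = "Dot Plot",
         xlab = "Miles per Gallon")
# Histogram
hist(mtcars$mpg, main = "Histogram", xlab = "Miles per Gallon")
# Box plot of mpg by cylinder count
boxplot(mpg ~ cyl, data = mtcars, main = "Box Plot",
        xlab = "Cylinders", ylab = "Miles per Gallon")
# Pie chart of gear counts
pie(table(mtcars$gear), main = "Pie Chart")
par(mfrow = c(1, 1))  # reset the layout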
3 (a) HDFS Command
(B) Write the script in R to sort the values contained in the following vector in ascending order and descending order: (23, 45, 10, 34, 89, 20, 67, 99). Demonstrate the output using a graph.
ANS:
# Create the vector
vector <- c(23, 45, 10, 34, 89, 20, 67, 99)
# Sort the vector in ascending order
sorted_vector_ascending <- sort(vector)
# Sort the vector in descending order
sorted_vector_descending <- sort(vector, decreasing = TRUE)
# Set up a 3x1 grid for subplots
par(mfrow = c(3, 1))
# Create a sequence of index labels
index_labels <- 1:length(vector)
# Plot the original vector
barplot(vector, main = "Original Vector", xlab = "Index", ylab = "Value",
        col = "blue", names.arg = index_labels)
# Plot the vector in ascending order
barplot(sorted_vector_ascending, main = "Sorted in Ascending Order", xlab = "Index",
        ylab = "Value", col = "green", names.arg = index_labels)
# Plot the vector in descending order
barplot(sorted_vector_descending, main = "Sorted in Descending Order", xlab = "Index",
        ylab = "Value", col = "red", names.arg = index_labels)
4 (A) HDFS Command
(B) The following table shows the number of units of different products sold on different days. Create five sample numeric vectors from this data and visualize the data using R.
ANS:
product <- c("bread", "milk", "cola", "chocolate", "detergent")
monday <- c(12, 21, 10, 6, 5)
tuesday <- c(3, 27, 1, 7, 8)
wednesday <- c(5, 18, 33, 4, 12)
thursday <- c(11, 20, 6, 13, 20)
friday <- c(9, 15, 12, 12, 23)
# Create a bar plot for Monday
barplot(monday, names.arg = product, col = "blue",
        main = "Units sold on Monday", xlab = "Product", ylab = "Units")
# Similar bar plots can be created for the other days by replacing "monday"
# with the respective day's data vector, or by wrapping the call in a function:
create_bar_plot <- function(day, day_name) {
  barplot(day, names.arg = product, col = "blue",
          main = paste("Units sold on", day_name), xlab = "Product", ylab = "Units")
}
# Create bar plots for each day
par(mfrow = c(1, 5))  # Arrange plots in a row
create_bar_plot(monday, "Monday")
create_bar_plot(tuesday, "Tuesday")
create_bar_plot(wednesday, "Wednesday")
create_bar_plot(thursday, "Thursday")
create_bar_plot(friday, "Friday")
par(mfrow = c(1, 1))  # Reset the layout
5 (a) HDFS Command
(b) Consider the following data frame given below:
Subject Class Marks
1       1     56
2       2     75
3       1     48
4       2     69
5       1     84
6       2     53
(i) Create a subset of Subject less than 4 by using the subset() function and demonstrate the output.
(ii) Create a subset where the Subject column is less than 3 and the Class equals 2 by using [] brackets and demonstrate the output using R.
ANS:
# Create the data frame
df <- data.frame(Subject = 1:6, Class = c(1, 2, 1, 2, 1, 2),
                 Marks = c(56, 75, 48, 69, 84, 53))
# Create a subset of subjects less than 4
subset_df <- subset(df, Subject < 4)
# Display the subset
subset_df
# Create a subset using [ ] brackets
subset_df <- df[df$Subject < 3 & df$Class == 2, ]
# Display the subset
subset_df
(iii) Visualize the data
Ans:
cl <- c(1, 2, 3, 4, 5, 6)
mr <- c(56, 75, 48, 69, 84, 53)
barplot(mr, names.arg = cl, col = "blue", main = "Marks by Subject",
        xlab = "Subject", ylab = "Marks")
6 (a) HDFS Command
(b) The data analyst of Argon Technology, Mr. John, needs to enter the salaries of 10 employees in R. The salaries of the employees are given in the following table:
Sr. No. Name of employees Salaries
1  Vivek  21000
2  Karan  55000
3  James  67000
4  Soham  50000
5  Renu   54000
6  Farah  40000
7  Hetal  30000
8  Mary   70000
9  Ganesh 20000
10 Krish  15000
i) Which R command will Mr. John use to enter these values? Demonstrate the output.
ii) Now Mr. John wants to add the salaries of 5 new employees to the existing table. Which command will he use to join the datasets with the new values in R? Demonstrate the output.
(iv) Visualize the data using a chart.
Ans:
# Create a data frame with the initial salaries of 10 employees
employee_data <- data.frame(
  "Sr. No." = 1:10,
  "Name of employees" = c("Vivek", "Karan", "James", "Soham", "Renu", "Farah", "Hetal",
                          "Mary", "Ganesh", "Krish"),
  "Salaries" = c(21000, 55000, 67000, 50000, 54000, 40000, 30000, 70000, 20000, 15000),
  check.names = FALSE  # keep the column names exactly as written
)
# Print the data frame
print(employee_data)
# Create a data frame with the salaries of 5 new employees
new_employees <- data.frame(
  "Sr. No." = 11:15,
  "Name of employees" = c("harsh", "pankaj", "ishwar", "disha", "kabir"),
  "Salaries" = c(25000, 48000, 62000, 45000, 51000),
  check.names = FALSE
)
# Combine the new data with the existing data using rbind
combined_data <- rbind(employee_data, new_employees)
# Print the combined data
print(combined_data)
# Create a vector of salaries
salaries <- combined_data$Salaries
# Create a vector of employee names for labelling the bars
employee_names <- combined_data$`Name of employees`
# Create the bar chart
barplot(salaries, names.arg = employee_names, col = "blue",
        main = "Employee Salaries", xlab = "Employee Name", ylab = "Salary")
# Create a vector of years of experience (example data)
years_of_experience <- c(3, 5, 7, 4, 6, 2, 4, 8, 1, 2, 5, 3, 6, 7, 4)
# Create a scatter plot
plot(years_of_experience, salaries,
     main = "Scatter Plot of Salaries vs. Years of Experience",
     xlab = "Years of Experience", ylab = "Salary")
# Add labels for each point (employee name)
text(years_of_experience, salaries, labels = combined_data$`Name of employees`, pos = 3)
7 (a) HDFS Command
(b) Analyse and visualize churn modelling data using R.
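No worked answer appears above for the churn practical; a minimal sketch, assuming a CSV export of the usual churn-modelling data with columns such as Exited, Geography, Age and Balance (adjust the file name and column names to the actual dataset), could be:

# Read the churn dataset (replace "Churn_Modelling.csv" with the actual file path)
churn_data <- read.csv("Churn_Modelling.csv")
# Basic structure and summary
str(churn_data)
summary(churn_data)
# Churn counts (assumes a 0/1 "Exited" column)
churn_counts <- table(churn_data$Exited)
pie(churn_counts, labels = c("Retained", "Exited"), col = c("green", "red"),
    main = "Customer Churn")
# Churn by geography (assumes a "Geography" column)
barplot(table(churn_data$Exited, churn_data$Geography), beside = TRUE,
        col = c("green", "red"), legend.text = c("Retained", "Exited"),
        main = "Churn by Geography")
# Age distribution and balance by churn status
hist(churn_data$Age, col = "skyblue", main = "Age Distribution", xlab = "Age")
boxplot(Balance ~ Exited, data = churn_data, names = c("Retained", "Exited"),
        main = "Balance by Churn Status", ylab = "Balance")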
8 (a) HDFS Command
(b) Analyse and visualize IRIS data using R.
Ans:
# Load the necessary library (only if it's not already loaded)
# install.packages("readr")
library(readr)
# Read the Iris dataset from a CSV file
iris_data <- read.csv("iris.csv")  # Replace "iris.csv" with the actual file path
# Create a box plot
boxplot(iris_data$PetalLengthCm ~ iris_data$Species,
        xlab = "Species",
        ylab = "Petal Length (cm)",
        main = "Box Plot of Petal Length by Species")
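The box plot gives one view of the data; a couple of further exploratory commands that could be added (assuming the remaining measurement columns follow the same naming pattern as PetalLengthCm above) are:

# Quick numeric overview of all columns
summary(iris_data)
# Pairwise scatterplots of the four measurements, coloured by species
pairs(iris_data[, c("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm")],
      col = as.integer(factor(iris_data$Species)),
      main = "Iris Measurements by Species")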
9 (a) HDFS Command
(b) Analyse and visualize supermarket data using R.
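No worked answer appears above for the supermarket practical; a minimal sketch, assuming a sales CSV with columns such as Branch, Total and Payment (adjust the file name and column names to the actual dataset), could be:

# Read the supermarket dataset (replace "supermarket_sales.csv" with the actual file path)
sales <- read.csv("supermarket_sales.csv")
# Overview of the data
head(sales)
summary(sales)
# Total sales per branch (assumes "Branch" and "Total" columns)
branch_totals <- tapply(sales$Total, sales$Branch, sum)
barplot(branch_totals, col = "skyblue", main = "Total Sales by Branch",
        xlab = "Branch", ylab = "Total Sales")
# Share of payment methods (assumes a "Payment" column)
pie(table(sales$Payment), main = "Payment Methods")
# Distribution of individual bill totals
hist(sales$Total, col = "lightgreen", main = "Distribution of Bill Totals", xlab = "Total")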
10 (a) HDFS Command
(b) Analyse and visualize Loan data using R.
Ans:
# Read the loan dataset from a CSV file
df <- read.csv("loan.csv")  # Replace "loan.csv" with the actual file path
# Working copy used when filling missing values
df_filled <- df
# Print the first few rows of the data
head(df)
# Print the column names
cat("Column Names: ", paste(names(df), collapse = ", "), "\n")
# Print a summary of all columns
summary(df)
# Check for missing values in each column
missing_values <- colSums(is.na(df_filled))
# Identify columns with missing values
columns_with_missing_data <- names(missing_values[missing_values > 0])
# Print column names with missing data
cat("Columns with missing data:\n")
for (col in columns_with_missing_data) {
  cat(col, " (Missing values: ", missing_values[col], ")\n")
}
# Calculate the medians for the specified columns
median_loan_amount <- median(df_filled$LoanAmount, na.rm = TRUE)
median_loan_term <- median(df_filled$Loan_Amount_Term, na.rm = TRUE)
median_credit_history <- median(df_filled$Credit_History, na.rm = TRUE)
# Fill missing values with medians
df_filled$LoanAmount[is.na(df_filled$LoanAmount)] <- median_loan_amount
df_filled$Loan_Amount_Term[is.na(df_filled$Loan_Amount_Term)] <- median_loan_term
df_filled$Credit_History[is.na(df_filled$Credit_History)] <- median_credit_history
# Visualize the data
hist(df$ApplicantIncome, col = "lightgreen", main = "Applicant Income Histogram")
status_counts <- table(df$Loan_Status)
pie(status_counts, labels = status_counts, col = c("green", "red"), main = "Loan Status")
barplot(table(df$Gender), col = "skyblue", main = "Gender Distribution")
