-
Notifications
You must be signed in to change notification settings - Fork 13
/
copyMerge.py
46 lines (35 loc) · 1.72 KB
/
copyMerge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def copyMerge(src_dir, dst_file, overwrite=False, deleteSource=False, debug=False):
    """Concatenate every file under ``src_dir`` on HDFS into ``dst_file``.

    Python replacement for Hadoop's ``FileUtil.copyMerge`` (deprecated in
    Hadoop 3.0), driven through the Py4J JVM gateway of an existing
    SparkContext bound to the module-level name ``sc``.

    Parameters:
        src_dir (str): HDFS directory whose regular files are merged,
            in alphabetical order of their full path.
        dst_file (str): HDFS path of the merged output file.
        overwrite (bool): overwrite ``dst_file`` if it already exists.
        deleteSource (bool): recursively remove ``src_dir`` after the merge.
        debug (bool): print a progress line per appended file.

    Raises:
        ValueError: if ``src_dir`` contains no regular files.
    """
    # NOTE: migrated to the https://github.com/Tagar/abalon package.
    # Reach into the JVM for the Hadoop classes via the SparkContext gateway.
    hadoop = sc._jvm.org.apache.hadoop
    conf = hadoop.conf.Configuration()
    fs = hadoop.fs.FileSystem.get(conf)

    # Collect only regular files; sub-directories under src_dir are skipped.
    src_paths = [status.getPath()
                 for status in fs.listStatus(hadoop.fs.Path(src_dir))
                 if status.isFile()]
    if not src_paths:
        raise ValueError("Source directory {} is empty".format(src_dir))
    # Deterministic, alphabetical merge order.
    src_paths.sort(key=str)

    out_stream = fs.create(hadoop.fs.Path(dst_file), overwrite)
    try:
        # Append each source file to the target, one after another.
        for src_path in src_paths:
            if debug:
                print("Appending file {} into {}".format(src_path, dst_file))
            in_stream = fs.open(src_path)  # Hadoop InputStream object
            try:
                # Final False => keep out_stream open for the next file.
                hadoop.io.IOUtils.copyBytes(in_stream, out_stream, conf, False)
            finally:
                in_stream.close()
    finally:
        out_stream.close()

    if deleteSource:
        fs.delete(hadoop.fs.Path(src_dir), True)  # True = recursive
        if debug:
            print("Source directory {} removed.".format(src_dir))
# Demo invocation. Guarded so that merely importing this module does not
# trigger a destructive merge (deleteSource=True removes the source dir);
# it now runs only when the file is executed as a script.
if __name__ == "__main__":
    copyMerge('/user/rdautkha/testdir', '/user/rdautkha/test_merge.txt', debug=True, overwrite=True, deleteSource=True)