Skip to content

Commit

Permalink
hdfs: copy: don't read everything into memory (#5277)
Browse files: browse the repository at this point in the history
Regression from #4973 and #4747
  • Loading branch information
efiop authored Jan 15, 2021
1 parent b027760 commit 273518a
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
3 changes: 2 additions & 1 deletion dvc/tree/hdfs.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import re
import shutil
import subprocess
from collections import deque
from contextlib import closing, contextmanager
Expand Down Expand Up @@ -162,7 +163,7 @@ def copy(self, from_info, to_info, **_kwargs):
with closing(
hdfs.open_output_stream(tmp_info.path)
) as tmp_fobj:
tmp_fobj.write(from_fobj.read())
shutil.copyfileobj(from_fobj, tmp_fobj)
hdfs.move(tmp_info.path, to_info.path)
except Exception:
self.remove(tmp_info)
Expand Down
5 changes: 3 additions & 2 deletions dvc/tree/webhdfs.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
import logging
import os
import shutil
import threading
from contextlib import contextmanager
from urllib.parse import urlparse
Expand Down Expand Up @@ -134,8 +135,8 @@ def get_file_hash(self, path_info):

def copy(self, from_info, to_info, **_kwargs):
with self.hdfs_client.read(from_info.path) as reader:
content = reader.read()
self.hdfs_client.write(to_info.path, data=content)
with self.hdfs_client.write(to_info.path) as writer:
shutil.copyfileobj(reader, writer)

def move(self, from_info, to_info, mode=None):
self.hdfs_client.makedirs(to_info.parent.path)
Expand Down

0 comments on commit 273518a

Please sign in to comment.