Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve download file handling #737

Merged
merged 2 commits into from
Feb 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions alembic/versions/fb657f2ee8a7_drop_file_original_filename.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""drop File.original_filename

Revision ID: fb657f2ee8a7
Revises: 86b01b6290da
Create Date: 2020-01-23 18:55:09.857324

"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "fb657f2ee8a7"
down_revision = "86b01b6290da"
branch_labels = None
depends_on = None


def upgrade():
    """Drop the ``original_filename`` column from the ``files`` table.

    The table is rebuilt rather than altered in place:

    1. the existing ``files`` table is renamed to ``original_files``;
    2. a new ``files`` table is created without ``original_filename``;
    3. the remaining columns are copied across from ``original_files``;
    4. ``original_files`` is dropped.

    NOTE(review): the rebuild is presumably needed because the target
    database cannot drop a column directly -- confirm against the
    deployment's database engine.
    """
    conn = op.get_bind()

    op.rename_table("files", "original_files")

    # Raw SQL is wrapped in sa.text(): passing a plain string to
    # Connection.execute() is deprecated in SQLAlchemy 1.4 and rejected
    # in 2.0. The SQL text itself is unchanged.
    conn.execute(
        sa.text(
            """
CREATE TABLE files (
id INTEGER NOT NULL,
uuid VARCHAR(36) NOT NULL,
filename VARCHAR(255) NOT NULL,
file_counter INTEGER NOT NULL,
size INTEGER NOT NULL,
download_url VARCHAR(255) NOT NULL,
is_downloaded BOOLEAN DEFAULT 0 NOT NULL,
is_read BOOLEAN DEFAULT 0 NOT NULL,
is_decrypted BOOLEAN,
source_id INTEGER NOT NULL,
CONSTRAINT pk_files PRIMARY KEY (id),
CONSTRAINT fk_files_source_id_sources FOREIGN KEY(source_id) REFERENCES sources (id),
CONSTRAINT uq_messages_source_id_file_counter UNIQUE (source_id, file_counter),
CONSTRAINT uq_files_uuid UNIQUE (uuid),
CONSTRAINT files_compare_is_downloaded_vs_is_decrypted CHECK (CASE WHEN is_downloaded = 0 THEN is_decrypted IS NULL ELSE 1 END),
CONSTRAINT ck_files_is_downloaded CHECK (is_downloaded IN (0, 1)),
CONSTRAINT ck_files_is_read CHECK (is_read IN (0, 1)),
CONSTRAINT ck_files_is_decrypted CHECK (is_decrypted IN (0, 1))
)
"""
        )
    )

    # Copy every column except original_filename into the rebuilt table.
    conn.execute(
        sa.text(
            """
INSERT INTO files
(id, uuid, filename, file_counter, size, download_url, is_downloaded,
is_decrypted, is_read, source_id)
SELECT id, uuid, filename, file_counter, size, download_url, is_downloaded,
is_decrypted, is_read, source_id
FROM original_files
"""
        )
    )

    op.drop_table("original_files")


def downgrade():
    """Re-add the ``original_filename`` column to the ``files`` table.

    The column is declared as a non-nullable ``VARCHAR(255)`` with an
    empty-string server default. Only the column is added here; no
    values are written into it by this function.
    """
    original_filename_column = sa.Column(
        "original_filename",
        sa.VARCHAR(length=255),
        server_default=sa.text("''"),
        nullable=False,
    )
    op.add_column("files", original_filename_column)
61 changes: 43 additions & 18 deletions securedrop_client/api_jobs/downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,14 @@ def call_api(self, api_client: API, session: Session) -> Any:
self._decrypt(os.path.join(self.data_dir, db_object.filename), db_object, session)
return db_object.uuid

self._download(api_client, db_object, session)
self._decrypt(os.path.join(self.data_dir, db_object.filename), db_object, session)
destination = self._download(api_client, db_object, session)
self._decrypt(destination, db_object, session)
return db_object.uuid

def _download(self,
api: API,
db_object: Union[File, Message, Reply],
session: Session) -> None:
session: Session) -> str:
'''
Download the encrypted file. Check file integrity and move it to the data directory before
marking it as downloaded.
Expand All @@ -197,9 +197,12 @@ def _download(self,
)
raise exception

shutil.move(download_path, os.path.join(self.data_dir, db_object.filename))
destination = db_object.location(self.data_dir)
os.makedirs(os.path.dirname(destination), mode=0o700, exist_ok=True)
shutil.move(download_path, destination)
mark_as_downloaded(type(db_object), db_object.uuid, session)
logger.info("File downloaded: {}".format(db_object.filename))
logger.info("File downloaded to {}".format(destination))
return destination
except BaseError as e:
logger.debug("Failed to download file: {}".format(db_object.filename))
raise e
Expand All @@ -216,7 +219,9 @@ def _decrypt(self,
mark_as_decrypted(
type(db_object), db_object.uuid, session, original_filename=original_filename
)
logger.info("File decrypted: {}".format(os.path.basename(filepath)))
logger.info("File decrypted: {} (decrypted file: {})".format(
os.path.basename(filepath), original_filename)
)
except CryptoError as e:
mark_as_decrypted(type(db_object), db_object.uuid, session, is_decrypted=False)
logger.debug("Failed to decrypt file: {}".format(os.path.basename(filepath)))
Expand Down Expand Up @@ -293,11 +298,21 @@ def call_decrypt(self, filepath: str, session: Session = None) -> str:
'''
with NamedTemporaryFile('w+') as plaintext_file:
self.gpg.decrypt_submission_or_reply(filepath, plaintext_file.name, is_doc=False)
set_message_or_reply_content(
model_type=Reply,
uuid=self.uuid,
session=session,
content=plaintext_file.read())
try:
set_message_or_reply_content(
model_type=Reply,
uuid=self.uuid,
session=session,
content=plaintext_file.read())
finally:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can remove this cleanup since we use a temporary file that automatically gets removed (see how, when you remove this block of code, there is no decryption file left behind?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The temporary file is gone, but because I'm doing the work in subdirectories under data_dir, instead of at the top level, the folder named after the UUID of the message or reply will be left behind, empty. This cleanup keeps those from accumulating and possibly affecting performance.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah i see this makes sense.

just out of curiosity, why did you choose not to decrypt to the same uuid directory, e.g. instead of:

Message/baf388ff-dfbd-45e1-a629-bf9e761b1ac8
Reply/baf388ff-dfbd-45e1-a629-bf9e761b1ac8
File/baf388ff-dfbd-45e1-a629-bf9e761b1ac8/example_submission.pdf

why not:

baf388ff-dfbd-45e1-a629-bf9e761b1ac8/example_submission.pdf
baf388ff-dfbd-45e1-a629-bf9e761b1ac8/<temporary message and reply files that are removed>

Copy link
Contributor

@sssoleileraaa sssoleileraaa Jan 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah maybe it's to avoid having an empty uuid directory if there are only messages and replies and not files?

but i guess you could just decrypt them to the data dir and then there wouldn't need to be any folder creation and deletion cleanup. I prefer this way because it keeps the code simpler, so to be clear we would just have these in the data directory:

baf388ff-dfbd-45e1-a629-bf9e761b1ac8/example_submission.pdf
<temporary message and reply files downloaded with the server file name `<count>-<journalist designation>-<type of file>` removed once decrypted>

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason for including the class in the path is that UUIDs are only unique within a table, so if we were writing them into the root of the data directory, we could theoretically see collisions. I know, I know, UUIDs, but this approach uses the database constraints to guarantee it can't happen.

Copy link
Contributor

@sssoleileraaa sssoleileraaa Jan 29, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To recap our discussion from standup this morning, there were generally three ideas kicking around (we went with the third idea):

  1. keep storing downloaded messages and replies in the data directory and if they fail to decrypt, that's where we'll find them, and store submission files in the data directory until they are successfully decrypted into a directory with a unique name, either UUID or the server filename.

    Pros:

    • If messages and replies successfully decrypt, we don't have to delete any folders

    Cons:

    • files are not organized under a journalist designation, so files that are part of the same conversation would have to be collected in various places if we want to one day export multiple files and messages.

    • maintaining two different data directory structures in securedrop-export and securedrop-client, more cognitive overload

  2. store all messages and replies in their respective <Type>/<UUID> directories

    Pros:

    • the UUID never changes, so we don't have to change the name of the directory when the journalist designation changes

    • if for some reason the journalist designation name change fails when we update the filesystem, we can still easily delete messages and files because they will be organized under the UUID (this is relevant when removing a source or individual submissions)

    Cons:

    • (same cons as above, also a note about how we already have to change the filename if the journalist designation changes)
  3. maintain the same data directory structure as securedrop-export (see https://github.com/freedomofpress/securedrop-export/blob/master/README.md#export-archive-format)

    Pros:

    • same directory structure as we (plan to) use in the securedrop-export, less to remember, and if we find a problem with the structure we can easily fix it in both places

    • files are organized under a journalist designation, so files that are part of the same conversation would be easier to collect for a multi-file export or a full-source conversation export

    • it's more human readable

    Cons:

    • We have to change the name of the folders in addition to the files if the journalist designation changes

One additional thought I'm having now is that we might want to consider storing messages directly under the journalist designation name, instead of creating a subdirectory named after the server filename, since they will always be text files and have the same server filename as the subdirectory.

So instead of:

├── cytotoxic payer
│   ├── 1-cytotoxic-payer-msg
│   │   └── 1-cytotoxic-payer-msg.txt
│   ├── 2-cytotoxic-payer-msg
│   │   └── 2-cytotoxic-payer-msg.txt
│   └── 3-cytotoxic-payer-doc
│   │   └── original-filename.doc
├── grandiloquent pasteboard
│   └── 1-grandiloquent-pasteboard-doc
│   │   └── original-filename.pdf
└── snug seek

I'm proposing:

├── cytotoxic payer
│   ├── 1-cytotoxic-payer-msg.txt
│   ├── 2-cytotoxic-payer-msg.txt
│   └── 3-cytotoxic-payer-doc
│   │   └── original-filename.doc
├── grandiloquent pasteboard
│   └── 1-grandiloquent-pasteboard-doc
│   │   └── original-filename.pdf
└── snug seek

But this is a minor detail, and shouldn't hold up this PR since this has yet to be implemented in securedrop-export.

Ok, that's the recap, plus one additional thought!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of the requirements in #714 was to remove File.original_filename, instead replacing File.filename with the submission's original filename when the submission was unpacked. That's going to make the hierarchy above impossible, since we won't have 1-grandiloquent-pasteboard-doc in the File once it's been processed.

Keeping the original_filename column would enable this structure.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok so we discussed this a bit and @creviera is gonna pick up freedomofpress/securedrop#4304 to remove the renaming of journalist_designation entirely, we'll pick into the next core/server minor release. One thing we thought about when we discussed this is that the metadata sync (and file rename) and file/message/reply download can occur concurrently, so it's definitely worth removing the rename to avoid racing behavior (i.e. to avoid guarding against such behavior). This way we can just use the securedrop-export structure everywhere.

so: for the remainder of this review, we can assume that this will be tested with a version of securedrop server that lacks journalist_designation renaming

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and for those not part of the discussion today, we agreed that it makes sense to stick with idea #3 (see recap of all the three ideas we discussed and their pros and cons from Wednesday's standup: #737 (comment))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(ignore the #3 link... forgot that would link to an issue)

# clean up directory where decryption happened
try:
os.rmdir(os.path.dirname(filepath))
except Exception as e:
logger.warning(
"Error deleting decryption directory of message %s: %s", self.uuid, e
)

return ""


Expand Down Expand Up @@ -339,12 +354,21 @@ def call_decrypt(self, filepath: str, session: Session = None) -> str:
The return value is an empty string; messages have no original filename.
'''
with NamedTemporaryFile('w+') as plaintext_file:
self.gpg.decrypt_submission_or_reply(filepath, plaintext_file.name, is_doc=False)
set_message_or_reply_content(
model_type=Message,
uuid=self.uuid,
session=session,
content=plaintext_file.read())
try:
self.gpg.decrypt_submission_or_reply(filepath, plaintext_file.name, is_doc=False)
set_message_or_reply_content(
model_type=Message,
uuid=self.uuid,
session=session,
content=plaintext_file.read())
finally:
rmol marked this conversation as resolved.
Show resolved Hide resolved
# clean up directory where decryption happened
try:
os.rmdir(os.path.dirname(filepath))
except Exception as e:
logger.warning(
"Error deleting decryption directory of message %s: %s", self.uuid, e
)
return ""


Expand Down Expand Up @@ -385,8 +409,9 @@ def call_decrypt(self, filepath: str, session: Session = None) -> str:
the file extensions, e.g. 1-impractical_thing-doc.gz.gpg -> 1-impractical_thing-doc
'''
fn_no_ext, _ = os.path.splitext(os.path.splitext(os.path.basename(filepath))[0])
plaintext_filepath = os.path.join(self.data_dir, fn_no_ext)
plaintext_filepath = os.path.join(os.path.dirname(filepath), fn_no_ext)
rmol marked this conversation as resolved.
Show resolved Hide resolved
original_filename = self.gpg.decrypt_submission_or_reply(
filepath, plaintext_filepath, is_doc=True
)
logger.info("""Decrypted file "%s" to "%s" """, filepath, original_filename)
return original_filename
8 changes: 6 additions & 2 deletions securedrop_client/crypto.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,12 @@ def decrypt_submission_or_reply(self,

# Store the plaintext in the file located at the specified plaintext_filepath
if is_doc:
original_filename = read_gzip_header_filename(out.name)
with gzip.open(out.name, 'rb') as infile, open(plaintext_filepath, 'wb') as outfile:
original_filename = read_gzip_header_filename(out.name) or plaintext_filepath
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since you're using the filepath as the default filename here, in case a file name is missing from the header, i think you'll have to clean up our code elsewhere that has logic to use the server filename, or perhaps you did that? i haven't come across it yet or maybe i missed it.

decrypt_path = os.path.join(
os.path.dirname(filepath),
os.path.basename(original_filename)
)
with gzip.open(out.name, 'rb') as infile, open(decrypt_path, 'wb') as outfile:
shutil.copyfileobj(infile, outfile)
else:
shutil.copy(out.name, plaintext_filepath)
Expand Down
54 changes: 44 additions & 10 deletions securedrop_client/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ def collection(self) -> List:
datetime.datetime(datetime.MINYEAR, 1, 1))))
return collection

@property
def journalist_filename(self) -> str:
    '''
    Return the journalist designation reduced to filename-safe characters.

    The designation is lowercased and its spaces become underscores; any
    character left over that is not a lowercase ASCII letter, a digit, a
    hyphen or an underscore is dropped.
    '''
    permitted = frozenset('abcdefghijklmnopqrstuvwxyz1234567890-_')
    normalized = self.journalist_designation.lower().replace(' ', '_')
    return ''.join(filter(permitted.__contains__, normalized))


class Message(Base):

Expand Down Expand Up @@ -123,6 +129,18 @@ def __str__(self) -> str:
def __repr__(self) -> str:
return '<Message {}>'.format(self.filename)

def location(self, data_dir: str) -> str:
    '''
    Return the absolute path of this Message's file under data_dir.

    The path is <data_dir>/<source journalist_filename>/<name>.txt, where
    <name> is self.filename with its final extension removed.
    '''
    source_dir = self.source.journalist_filename
    stem, _extension = os.path.splitext(self.filename)
    relative = os.path.join(data_dir, source_dir, stem + '.txt')
    return os.path.abspath(relative)


class File(Base):

Expand All @@ -135,15 +153,6 @@ class File(Base):
uuid = Column(String(36), unique=True, nullable=False)
filename = Column(String(255), nullable=False)

# Files from the SecureDrop journalist API are gzipped, then
# encrypted with GPG. The gzip header contains the original
# filename, which makes it easier for the client to open the file
# with the right application. We'll record that filename here
# after we've downloaded, decrypted and extracted the file.
# If the header does not contain the filename for some reason,
# this should be the same as filename.
original_filename = Column(String(255), nullable=False, server_default="")

file_counter = Column(Integer, nullable=False)
size = Column(Integer, nullable=False)
download_url = Column(String(255), nullable=False)
Expand Down Expand Up @@ -179,13 +188,26 @@ def __str__(self) -> str:
Return something that's a useful string representation of the file.
"""
if self.is_downloaded:
return "File: {}".format(self.original_filename)
return "File: {}".format(self.filename)
else:
return '<Encrypted file on server>'

def __repr__(self) -> str:
return '<File {}>'.format(self.filename)

def location(self, data_dir: str) -> str:
    '''
    Return the absolute path of this File's file under data_dir.

    The path is
    <data_dir>/<source journalist_filename>/<counter>-<source journalist_filename>-doc/<filename>,
    where <counter> is self.file_counter and <filename> is self.filename.
    '''
    source_dir = self.source.journalist_filename
    submission_dir = '{}-{}-doc'.format(self.file_counter, self.source.journalist_filename)
    relative = os.path.join(data_dir, source_dir, submission_dir, self.filename)
    return os.path.abspath(relative)


class Reply(Base):

Expand Down Expand Up @@ -246,6 +268,18 @@ def __str__(self) -> str:
def __repr__(self) -> str:
return '<Reply {}>'.format(self.filename)

def location(self, data_dir: str) -> str:
    '''
    Return the absolute path of this Reply's file under data_dir.

    The path is <data_dir>/<source journalist_filename>/<name>.txt, where
    <name> is self.filename with its final extension removed.
    '''
    source_dir = self.source.journalist_filename
    text_name = os.path.splitext(self.filename)[0] + '.txt'
    return os.path.abspath(os.path.join(data_dir, source_dir, text_name))


class DraftReply(Base):

Expand Down
13 changes: 7 additions & 6 deletions securedrop_client/gui/widgets.py
Original file line number Diff line number Diff line change
Expand Up @@ -1897,7 +1897,7 @@ def __init__(
self.print_button.clicked.connect(self._on_print_clicked)

# File name or default string
self.file_name = SecureQLabel(self.file.original_filename)
self.file_name = SecureQLabel(self.file.filename)
self.file_name.setObjectName('file_name')
self.file_name.installEventFilter(self)
self.no_file_name = SecureQLabel('ENCRYPTED FILE ON SERVER')
Expand Down Expand Up @@ -1952,7 +1952,7 @@ def _on_file_downloaded(self, file_uuid: str) -> None:
if file_uuid == self.file.uuid:
self.file = self.controller.get_file(self.file.uuid)
if self.file.is_downloaded:
self.file_name.setText(self.file.original_filename)
self.file_name.setText(self.file.filename)
self.download_button.hide()
self.no_file_name.hide()
self.export_button.show()
Expand Down Expand Up @@ -1983,10 +1983,11 @@ def _on_export_clicked(self):
"""
Called when the export button is clicked.
"""
if not self.controller.downloaded_file_exists(self.file.uuid):
if not self.controller.downloaded_file_exists(self.file):
return

dialog = ExportDialog(self.controller, self.file.uuid, self.file.original_filename)
dialog = ExportDialog(self.controller, self.file.uuid,
self.file.filename)
dialog.show()
dialog.export()
dialog.exec()
Expand All @@ -1996,7 +1997,7 @@ def _on_print_clicked(self):
"""
Called when the print button is clicked.
"""
if not self.controller.downloaded_file_exists(self.file.uuid):
if not self.controller.downloaded_file_exists(self.file):
return

dialog = PrintDialog(self.controller, self.file.uuid)
Expand All @@ -2014,7 +2015,7 @@ def _on_left_click(self):

if self.file.is_downloaded:
# Open the already downloaded file.
self.controller.on_file_open(self.file.uuid)
self.controller.on_file_open(self.file)
else:
if self.controller.api:
# Indicate in downloading state... but only after 0.3 seconds (i.e.
Expand Down
Loading