Skip to content

Commit

Permalink
Merge pull request #27 from richfromm/file-not-found
Browse files Browse the repository at this point in the history
Add --ignore-file-not-found option to ignore HTTP not found errors downloading file attachments from Slack
  • Loading branch information
richfromm authored Feb 7, 2023
2 parents 0cc585a + ab37883 commit 2617f7e
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 14 deletions.
38 changes: 37 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@ following manners. This is the order that is searched:

Briefly, the script is executed via:

./slack2discord.py [--token TOKEN] [--server SERVER] [--create] [--downloads-dir DOWNLOADS_DIR] \
./slack2discord.py [--token TOKEN] [--server SERVER] [--create] \
[--users-file USERS_FILE] [--downloads-dir DOWNLOADS_DIR] [--ignore-file-not-found] \
[-v | --verbose] [-n | --dry-run] <src-and-dest-related-options>

The src and dest related options can be specified in one of three different
Expand All @@ -211,6 +212,41 @@ options, execute:

./slack2discord.py --help

## File attachments

As noted at the end of [Slack: How to read Slack data exports], files uploaded
to Slack are not directly part of a Slack export. Instead, the export contains
URLs that point to the locations of the files on Slack servers. These URLs
include a token that gives you the ability to access those files. These tokens
are listed along with the exports on your Slack workspace's export page, which
can be found at `https://<workspace-name>.slack.com/services/export`

When running the script, such files will first be downloaded from Slack to a
local dir (which can be controlled with the `--downloads-dir` option), and
then uploaded to Discord. There are a number of reasons why this operation
could fail, and the files listed in the Slack export might not be found. These
include:

* The file was deleted from Slack after the export was performed

* The download token associated with that file export was revoked (this can be
done via the export page for the Slack workspace)

In these cases (and for any other HTTP errors related to downloading file
attachments from Slack), the default behavior is for the script to fail by
raising an HTTPError. This allows you to investigate the situation before
deciding how to proceed.

For the special case of HTTP Not Found errors, you can override this behavior,
and simply log the not found file as a warning, with the command line option
`--ignore-file-not-found`.

Note that if any files are deleted before the export is created, this state
will be reflected within the export (the file will have its `mode` set to
`tombstone`). Any such files will always be logged as warnings and ignored,
regardless of whether or not the ignore option is set for the HTTP Not Found
case.

## Internals

The Discord Python API uses
Expand Down
1 change: 1 addition & 0 deletions slack2discord.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
downloader = SlackDownloader(
parsed_messages=parser.parsed_messages,
downloads_dir=config.downloads_dir,
ignore_not_found=config.ignore_file_not_found,
)
downloader.download()

Expand Down
13 changes: 12 additions & 1 deletion slack2discord/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
USAGE = dedent(
f"""
{argv[0]} [--token TOKEN] [--server SERVER] [--create] \\
[--users-file USERS_FILE] [--downloads-dir DOWNLOADS_DIR] \\
[--users-file USERS_FILE] [--downloads-dir DOWNLOADS_DIR] [--ignore-file-not-found] \\
[-v | --verbose] [-n | --dry-run] <src-and-dest-related-options>
src and dest related options must follow one of the following mutually exclusive formats:
Expand Down Expand Up @@ -219,6 +219,17 @@ def get_config(argv):
" execution, it is highly recommended that each execution of the script"
" use a unique directory.")

parser.add_argument('--ignore-file-not-found',
required=False,
action='store_true',
help="Ignore HTTP Not Found errors when downloading attached files. The"
" default behavior is to fail by raising an HTTPError on any HTTP errors"
" related to downloading file attachments from Slack. Use this option to"
" ignore errors of files not found, and log as warnings. This state can"
" occur if a user deletes a file from Slack after performing the export."
" Note that files deleted before the export are automatically logged as"
" warnings and ignored, regardless of this option.")

parser.add_argument('-v', '--verbose',
required=False,
action='store_true',
Expand Down
43 changes: 36 additions & 7 deletions slack2discord/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from os.path import dirname, exists, isdir, join, realpath
from time import time

from requests import get
from requests import codes, get

from .message import ParsedMessage, MessageFile

Expand All @@ -15,10 +15,14 @@ class SlackDownloader():
"""
Download a list of previously parsed files attached to Slack messages.
Once the files exist locally, they can then be uploaded to corresponding Discord messages.
Files not found by default raise an error.
This behavior can be overridden and they can be ignored with ignore_not_found.
"""
def __init__(self,
parsed_messages: dict,
downloads_dir: str = None):
downloads_dir: str = None,
ignore_not_found: bool = False):
# see SlackParser.parse() for details
self.parsed_messages = parsed_messages

Expand All @@ -41,6 +45,8 @@ def __init__(self,
# (if parsing includes any files)
self.downloads_dir = downloads_dir

self.ignore_not_found = ignore_not_found

self.files: list[MessageFile] = []

def _add_files(self, message: ParsedMessage) -> None:
Expand Down Expand Up @@ -77,12 +83,13 @@ def _populate_files(self) -> None:
for thread_message in thread.values():
self._add_files(thread_message)

def _wget(self, url, filename) -> None:
def _wget(self, url, filename, ignore_not_found = False) -> None:
"""
Fetch a file via HTTP GET from the given URL, and store it in the local filename.
Nothing is returned on success.
HTTP errors are raised as Exception's
Return True if we successfully downloaded the file.
HTTP errors are in general raised as exceptions.
If ignore_not_found is set, a not found error is allowed, and False is returned.
This is a simple implementation. We could instead stream data in chunks using
Response.iter_content:
Expand All @@ -103,10 +110,22 @@ def _wget(self, url, filename) -> None:
logger.warning(f"local filename already exists, will overwrite: {filename}")

with get(url) as req:
# Special case 404 errors, allowing user to ignore.
# All other HTTP errors raise an exception and fail.
if req.status_code == codes.not_found:
if ignore_not_found:
logger.warning(f"Not found error returned fetching {url} to {filename}, ignoring.")
return False
logger.error(f"Not found error returned fetching {url} to {filename}."
" You can ignore all of these with --ignore-file-not-found")
# intentional fall through, since we **do** want to raise the error next

req.raise_for_status()
with open(filename, 'wb') as file:
file.write(req.content)

return True

def download(self) -> None:
"""
Download all of the files from parsed messages to the downloads dir.
Expand All @@ -126,11 +145,21 @@ def download(self) -> None:
if not exists(self.downloads_dir):
makedirs(self.downloads_dir)

success = 0
not_found = 0
for file in self.files:
# using file.name would be more descriptive
# but that risks filename collisions
# we could place each file in its own dir, e.g. self.downloads_dir/file.id/file.name
# but that would be more awkward to work with
file.local_filename = join(self.downloads_dir, file.id)
self._wget(file.url, file.local_filename)
logger.info(f"Successfully downloaded {len(self.files)} files to {self.downloads_dir}")
if self._wget(file.url, file.local_filename, self.ignore_not_found):
success += 1
else:
file.not_found = True
not_found += 1

assert success + not_found == len(self.files)
logger.info(f"Successfully downloaded {success} files to {self.downloads_dir}")
if not_found > 0:
logger.warning(f"Ignored {not_found} files not found")
17 changes: 12 additions & 5 deletions slack2discord/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ def get_discord_add_files_args(self):
This can be passed via the method Message.add_files(*files), assuming the caller has
a Discord Message object
Any files which were not found and previously ignored are excluded
If there are no files, return None
For more details, see:
Expand All @@ -165,7 +167,8 @@ def get_discord_add_files_args(self):

return [discord.File(file.local_filename, # this is the actual file to upload
filename=file.name) # this is what Discord should call the file
for file in self.files]
for file in self.files
if not file.not_found] # exclude files not found


class MessageLink():
Expand Down Expand Up @@ -211,11 +214,15 @@ def __init__(self, id, name, url):
self.id = id # from slack
self.name = name
self.url = url # url_private in slack
# This will be set later, when the file is downloaded
# This will be set later, when the file is downloaded (successfully or not)
self.local_filename = None
# This is set in the special case of the file not found,
# which the user can optionally ignore.
self.not_found = False

def __repr__(self):
    """
    Return a debugging representation listing every field of this file.

    local_filename is rendered through ParsedMessage.str_or_none because it
    stays None until a download has been attempted.
    """
    # The closing parenthesis belongs only after the final field, so that the
    # output is a single balanced MessageFile(...) expression.
    return (f"MessageFile(id='{self.id}',"
            f" name='{self.name}',"
            f" url='{self.url}',"
            f" local_filename={ParsedMessage.str_or_none(self.local_filename)},"
            f" not_found={self.not_found})")

0 comments on commit 2617f7e

Please sign in to comment.