add google docs to preprocessing options
ajparsons authored Mar 30, 2022
1 parent cd51874 commit 85bc293
Showing 3 changed files with 110 additions and 3 deletions.
7 changes: 5 additions & 2 deletions requirements.txt
@@ -44,7 +44,10 @@ xlwt==1.3.0
openpyxl==2.6.4
xlrd==1.2.0
PyMuPDF==1.18.16
black==22.1.0
black==22.3.0
pylint==1.4.0
rich==12.0.1
cookiecutter==1.7.3
cookiecutter==1.7.3
google-api-python-client==2.42.0
oauth2client==4.1.3
python-unsplash==2.27.1
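
The Google API packages added here are used by the new stringprint/tools/google_download.py module (below), which reads a service-account credential as a JSON string from the GDRIVE_SERVICE_USER environment variable. A minimal sketch of providing that variable from a locally downloaded key file; the service_account.json path is a hypothetical example:

import os
from pathlib import Path

# Expose the service-account key the way google_download.py expects it:
# the full JSON document as a string in the GDRIVE_SERVICE_USER env var.
key_path = Path("service_account.json")  # hypothetical local key file
os.environ["GDRIVE_SERVICE_USER"] = key_path.read_text()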
76 changes: 76 additions & 0 deletions stringprint/tools/google_download.py
@@ -0,0 +1,76 @@
import io
import json
import os
from pathlib import Path
from typing import Optional, Union

from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from oauth2client.service_account import ServiceAccountCredentials


class GoogleDownloader:

    mime_types = {
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }

    def __init__(self):

        data = os.environ.get("GDRIVE_SERVICE_USER", None)
        if data is None:
            raise ValueError("GDRIVE_SERVICE_USER env var not set.")

        credentials = json.loads(data)

        credentials = ServiceAccountCredentials.from_json_keyfile_dict(
            credentials,
            [
                "https://spreadsheets.google.com/feeds",
                "https://www.googleapis.com/auth/drive",
            ],
        )
        self.drive_service = build("drive", "v3", credentials=credentials)

    def download_file(
        self,
        *,
        dest: Union[str, Path],
        google_id: Optional[str] = None,
        url: Optional[str] = None,
        mime_type: Optional[str] = None,
    ):
"""
url takes precidence over google_id
if mime_type not specified, will try and work out from destination extention
"""

        if not (google_id or url):
            raise ValueError("Must specify google id or url of file to download")

        if url:
            # assuming the edit view
            google_id = url.split("/")[-2]

        if isinstance(dest, str):
            dest = Path(dest)

        if mime_type is None:
            mime_type = self.__class__.mime_types.get(dest.suffix, None)

        if mime_type is None:
            raise ValueError(f"Can't extract a mimetype from {dest.suffix}")

        request = self.drive_service.files().export_media(
            fileId=google_id, mimeType=mime_type
        )
        fh = io.FileIO(dest, "wb")
        downloader = MediaIoBaseDownload(fh, request)
        done = False
print(f"waiting for download of {dest.name} start")
        while done is False:
            status, done = downloader.next_chunk()
            print(f"Download {int(status.progress() * 100)}%.")
        print("Download Finished.")
30 changes: 29 additions & 1 deletion stringprint/tools/preprocessing.py
@@ -1,14 +1,17 @@
from __future__ import annotations

import shutil
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, List, Optional, Tuple, TypeVar, Union, Dict
from typing import Callable, Dict, List, Optional, Tuple, TypeVar, Union

from PyPDF2 import PdfFileReader, PdfFileWriter
from stringprint.models import Article
from stringprint.tools.word_convert import convert_word
from useful_inkleby.files import QuickText

from .google_download import GoogleDownloader


def merge_pdfs(
    front: Path,
@@ -172,6 +175,8 @@ def process_text(self, qt: QuickText) -> QuickText:


class BasePreprocess:
    google_download: bool = True
    google_download_formats: list = ["contents.pdf", "doc_source.docx"]
    word_source_file: Optional[str] = None
    word_demote: bool = False  # push everyone down one header level
    word_convert: bool = True
Expand Down Expand Up @@ -201,6 +206,7 @@ def process(self) -> None:
"""
Run standard features
"""
self.do_google_download()
self.do_convert_word()
self.import_markdown()
self.combine_pdfs()
@@ -210,6 +216,18 @@
        self.postprocess()
        self.output()

    def do_google_download(self):
        """
        Download source files from Google Drive if a google_doc_url is set
        """
        if (
            self.__class__.google_download
            and "google_doc_url" in self.article.extra_values
        ):
            google_doc_url = self.article.extra_values["google_doc_url"]
            gd = GoogleDownloader()
            for f in self.__class__.google_download_formats:
                dest = self.doc_folder / f
                gd.download_file(dest=dest, url=google_doc_url)

    def import_markdown(self):
        """
        Import the source file as a QuickText object
Expand Down Expand Up @@ -237,6 +255,16 @@ def combine_pdfs(self) -> None:
        filename = self.pdf_output_filename

        output = self.doc_folder / filename

        if not (front_page.exists() or contents.exists()):
            print("neither source file for pdf merging exists, skipping")
            return

        if contents.exists() and not front_page.exists():
            print("no front page, just reusing contents page")
            shutil.copy(contents, output)
            return

        merge_pdfs(front_page, contents, output, start_page)
        print("combined pdfs")
