Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scripts to convert Conventions and FAQ source files to HTML #33

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
.idea
.DS_Store
_site/
.sass-cache/
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@ This is the development repository for the website http://phoible.github.io/, wh

## About PHOIBLE
PHOIBLE is a database of phonological inventories and distinctive features, encompassing more than 1600 languages (and growing). PHOIBLE data is published in browsable form online at [PHOIBLE Online](http://phoible.org), which corresponds with the most recent year-numbered [release](https://github.com/phoible/phoible/releases) of the [development repository](https://github.com/phoible/phoible).

### Convert Conventions and FAQ source files to HTML
- Install the Beautiful Soup 4 package: `pip install bs4`.
- In RStudio, run `Sys.getenv("RSTUDIO_PANDOC")` to find the pandoc path, then set `RSTUDIO_PANDOC = "YOUR_PATH"` in `convertMdToHTML.py`.
- Run `python convertMdToHTML.py`.
3 changes: 3 additions & 0 deletions _faq.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ output:
preserve_yaml: true
toc: true
toc_depth: 2
html_document:
toc: true
theme: default
csl: bib/phoible.csl
---

Expand Down
123 changes: 123 additions & 0 deletions convertMdToHTML.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import os
from os import path
import bs4
import re

# Directory containing the pandoc binary bundled with RStudio. It is passed to
# knitRmdToHTML.R, which exports it as the RSTUDIO_PANDOC environment variable.
# Set RSTUDIO_PANDOC in your environment to override, or edit the fallback below.
RSTUDIO_PANDOC = (
    os.environ.get('RSTUDIO_PANDOC')
    or '/Applications/RStudio.app/Contents/MacOS/pandoc'
)


def fix_FAQ(file_path, output_path):
    """Post-process the knitted FAQ HTML and write the embeddable fragment.

    Reads the HTML at ``file_path``, restyles tables, repairs blockquotes,
    ``<em>`` spacing and the reference list, demotes every heading by one
    level, then writes the ``container-fluid main-container`` div followed by
    the contents of ``scripts.js`` to ``output_path``.

    Args:
        file_path: Path of the HTML file to read.
        output_path: Path of the HTML fragment to write.

    Raises:
        ValueError: If the main container div is not present in the input.
        OSError: If the input file or ``scripts.js`` cannot be read, or the
            output cannot be written.
    """
    # Explicit UTF-8 so non-ASCII text does not depend on the platform locale.
    with open(file_path, encoding='utf-8') as f:
        soup = bs4.BeautifulSoup(f.read(), 'html.parser')

    # beautify tables
    for table in soup.find_all('table'):
        table['cellpadding'] = '0'
        table['cellspacing'] = '0'
        table['border'] = '0'
        table['class'] = 'table table-bordered order-column compact stripe dataTable no-footer table-nonfluid'
        table['role'] = 'grid'
    for th in soup.find_all('th'):
        th['role'] = 'row'

    # fix blockquotes: a literal '>' left in the paragraph text marks quoted
    # material. Serialized HTML escapes it as '&gt;', so look for that in the
    # paragraph's inner HTML (a bare '>' would match the <p> tag itself and
    # hit every paragraph).
    for p in soup.find_all('p'):
        inner_html = p.decode_contents()
        if '&gt;' not in inner_html:
            continue
        lead, _, quoted = inner_html.partition('&gt;')
        p.clear()
        p.append(bs4.BeautifulSoup(lead, 'html.parser'))
        blockquote = soup.new_tag('blockquote')
        blockquote.append(bs4.BeautifulSoup(quoted, 'html.parser'))
        p.append(blockquote)

    # fix <em> spacing: strip whitespace just inside the emphasis tags
    for em in soup.find_all('em'):
        temp = str(em).replace('<em>', '').replace('</em>', '').strip()
        em.clear()
        em.append(bs4.BeautifulSoup(temp, 'html.parser'))

    # fix references
    references_div = soup.find('div', {'class': 'references'})
    if references_div is not None:
        for p in references_div.find_all('p'):
            # Text clean-ups run first: assigning p.string replaces all child
            # nodes, so doing them after the URL fix would flatten the link.
            if ' ,' in str(p):
                p.string = re.sub(r' +,', ',', p.get_text())
            if ' .' in str(p):
                # The dot is escaped; unescaped it matches any character and
                # swallows the first letter after the spaces.
                p.string = re.sub(r' +\.', '.', p.get_text())
            if p.get_text().endswith(':'):
                p.string = p.get_text()[:-1] + '.'
            # fix url: turn the "Online: urlhttp..." residue into a real link
            if 'Online: urlhttp' in str(p):
                before, rest = p.get_text().split('Online: urlhttp', 1)
                p.string = before
                a = soup.new_tag('a')
                a.string = 'http' + rest
                a['href'] = a.string
                p.append(a)

    # fix titles size: demote h5..h1 by one level, deepest first so a heading
    # is never renamed twice
    for level in range(5, 0, -1):
        for tag in soup.find_all(f'h{level}'):
            tag.name = f'h{level + 1}'

    div_content = soup.find('div', {'class': 'container-fluid main-container'})
    if div_content is None:
        # Fail loudly instead of writing the literal text "None" to the output.
        raise ValueError(
            f'no <div class="container-fluid main-container"> found in {file_path}'
        )

    # Read the scripts before opening the output so that a missing scripts.js
    # does not leave a truncated output file behind.
    with open('scripts.js', encoding='utf-8') as f2:
        scripts = f2.read()

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(str(div_content))
        # write scripts
        file.write('\n')
        file.write(scripts)


def fix_conventions(file_path, output_path):
    """Post-process the generated conventions HTML and write the result.

    Reads the HTML at ``file_path``, restyles tables, tags table body rows
    with alternating ``odd``/``even`` classes, demotes every heading by one
    level, and writes the whole document to ``output_path``. The two paths may
    be the same file: the input is fully read and closed before the output is
    opened.

    Args:
        file_path: Path of the HTML file to read.
        output_path: Path of the HTML file to write.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    # Explicit UTF-8 so non-ASCII text does not depend on the platform locale.
    with open(file_path, encoding='utf-8') as f:
        soup = bs4.BeautifulSoup(f.read(), 'html.parser')

    # beautify tables
    for table in soup.find_all('table'):
        table['cellpadding'] = '0'
        table['cellspacing'] = '0'
        table['border'] = '0'
        table['class'] = 'table table-bordered order-column compact stripe dataTable no-footer table-nonfluid'
        table['role'] = 'grid'
    for th in soup.find_all('th'):
        th['role'] = 'row'

    # Zebra-stripe each table body; numbering restarts per <tbody> and the
    # first row is 'odd'.
    for tbody in soup.find_all('tbody'):
        for row_number, tr in enumerate(tbody.find_all('tr'), start=1):
            tr['class'] = 'even' if row_number % 2 == 0 else 'odd'

    # fix titles size: demote h5..h1 by one level, deepest first so a heading
    # is never renamed twice
    for level in range(5, 0, -1):
        for tag in soup.find_all(f'h{level}'):
            tag.name = f'h{level + 1}'

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(str(soup))


def main():
    """Convert the FAQ and conventions sources to HTML and post-process them.

    Knits ``_faq.Rmd`` with ``knitRmdToHTML.R`` and converts
    ``conventions.rst`` with ``rst2html5``, running the matching fix-up
    function after each conversion.

    Raises:
        subprocess.CalledProcessError: If either external converter exits with
            a non-zero status.
        FileNotFoundError: If ``Rscript`` or ``rst2html5`` is not on PATH.
    """
    # Local import keeps this block self-contained.
    import subprocess

    print('Start kniting Rmd to HTML...')
    print('File: _faq.Rmd')
    # Argument lists avoid shell quoting problems (e.g. a pandoc path that
    # contains spaces); check=True stops before post-processing stale output.
    subprocess.run(
        ['Rscript', '--vanilla', 'knitRmdToHTML.R', '_faq.Rmd', RSTUDIO_PANDOC],
        check=True,
    )
    fix_FAQ('_faq.html', 'faq_with_indexes.html')
    print('File: conventions.rst')
    subprocess.run(['rst2html5', 'conventions.rst', 'conventions.html'], check=True)
    fix_conventions('conventions.html', 'conventions.html')
    # Reset the colour at the end so the terminal does not stay blue.
    print('Converted! Output files: \033[94m faq_with_indexes.html conventions.html\033[0m')


if __name__ == '__main__':
    main()
10 changes: 10 additions & 0 deletions knitRmdToHTML.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env Rscript
# Render an R Markdown file to an HTML document.
# Usage: Rscript knitRmdToHTML.R <input.Rmd> <RStudio pandoc path>
library(rmarkdown)

cli_args <- commandArgs(trailingOnly = TRUE)

# Both the input file and the pandoc location are required.
if (length(cli_args) < 2) {
  stop("Please provide arguments of (1)input .Rmd file (2)RStudio pandoc path.", call. = FALSE)
}

# Export the pandoc location given on the command line before rendering.
Sys.setenv(RSTUDIO_PANDOC = cli_args[2])

render(cli_args[1], 'html_document')
13 changes: 13 additions & 0 deletions scripts.js

Large diffs are not rendered by default.