Commit 8fd662f0 authored by Georges Khaznadar's avatar Georges Khaznadar ok

parent 8a9d8804
This is a parser and a web automaton which browses a wiki and extracts
the source of every page, then converts each one to an RST file, so that
the pages translated into Spanish end up in a usable tree from which the
User Manual of Eyes17 can be built.
import subprocess, re, os
from bs4 import BeautifulSoup
def getSoup(url, data=""):
    """
    Get a BeautifulSoup object from a URL.

    The page is downloaded by running ``curl --silent <url> -d <data>``
    and the captured output is parsed with the 'html.parser' backend.

    @param url the address of the page to fetch
    @param data a string of parameters handed to curl's -d option
    @result a BeautifulSoup digest of the downloaded page
    """
    # An argument list (no shell) keeps quotes and shell metacharacters in
    # url/data from being interpreted by a shell. stdout has to be piped,
    # otherwise the page would go to the terminal and nothing could be parsed.
    completed = subprocess.run(
        ["curl", "--silent", url, "-d", data],
        stdout=subprocess.PIPE,
        check=False,  # best effort: curl's exit status is deliberately ignored
    )
    # The raw bytes are passed on so BeautifulSoup performs the decoding.
    return BeautifulSoup(completed.stdout, 'html.parser')
def doku2rst(text, name, directory="output"):
    """
    Convert a text from dokuwiki format to RST format
    and write it to a file under some directory.

    @param text string in HTML format from a dokuwiki
    @param name name of the new file
    @param directory a path to write to; it is created when missing
    """
    # Create the target directory; an already existing one is not an error.
    os.makedirs(directory, exist_ok=True)
    # The output file is opened for text writing as <directory>/<name>.
    with open(os.path.join(directory, name),"wt") as outfile:
        # NOTE(review): `lines` is not assigned anywhere in the visible
        # source; presumably it is derived from `text` - TODO confirm.
        for l in lines:
            # NOTE(review): the bodies of the three tests below, and their
            # relative nesting, are not visible in this copy of the source.
            # Only the tested markers are known: the Spanish "edit and
            # translate this page" caption, and the opening and closing
            # <code> tags.
            if "> Editar y traducir esta página" in l:
            if "<code>" in l:
            if "</code>" in l:
if __name__=="__main__":
    # Fetch the index page selected by the `id` parameter.
    # NOTE(review): the URL argument is an empty string in this copy of the
    # source; the real address appears to be missing - confirm.
    soup=getSoup("", data="id=manual:fuentes_del_manual_eyes-17")
    # Keep every <a> element whose href matches the pattern below.
    links=soup.find_all("a", href=re.compile(r".*\.rst"))
    for l in links:
        # Progress trace: the link text, then its href with '/doku.php?' removed.
        print("===================",l.text, f"{l['href'].replace('/doku.php?','')}")
        # Request the linked page again with "&do=edit" appended to its query.
        # presumably this returns the editor view of the page - TODO confirm
        soup=getSoup(f"", data=f"{l['href'].replace('/doku.php?','')}&do=edit")
        # NOTE(review): `text` is never assigned in the visible source; a
        # statement extracting it from `soup` appears to be missing here.
        doku2rst(text, l.text)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment