hydrogram/hydrogram/parser/markdown.py at stable · hydrogram/hydrogram

History

179 lines (152 loc) · 5.79 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

# Hydrogram - Telegram MTProto API Client Library for Python

# This file is part of Hydrogram.

# Hydrogram is free software: you can redistribute it and/or modify

# it under the terms of the GNU Lesser General Public License as published

# by the Free Software Foundation, either version 3 of the License, or

# (at your option) any later version.

# Hydrogram is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License

# along with Hydrogram. If not, see <http://www.gnu.org/licenses/>.

import html

import re

from typing import TYPE_CHECKING, Optional

from hydrogram.enums import MessageEntityType

from . import utils

from .html import HTML

if TYPE_CHECKING:

import hydrogram

# Delimiters of the Markdown-style syntax handled by this module.
BOLD_DELIM = "**"
ITALIC_DELIM = "__"
UNDERLINE_DELIM = "--"
STRIKE_DELIM = "~~"
SPOILER_DELIM = "||"
CODE_DELIM = "`"
PRE_DELIM = "```"

# Group 1 captures a delimiter; groups 2 and 3 capture the text and target of
# an inline ``[text](target)`` link. Every delimiter character is
# backslash-escaped so it is matched literally. PRE_DELIM is listed ahead of
# CODE_DELIM so the alternation tries the three-backtick fence before the
# single backtick.
MARKDOWN_RE = re.compile(
    r"({d})|\[(.+?)\]\((.+?)\)".format(
        d="|".join(
            "".join("\\" + char for char in delim)
            for delim in (
                PRE_DELIM,
                CODE_DELIM,
                STRIKE_DELIM,
                UNDERLINE_DELIM,
                ITALIC_DELIM,
                BOLD_DELIM,
                SPOILER_DELIM,
            )
        )
    )
)

# HTML templates the delimiters are rewritten into.
OPENING_TAG = "<{}>"
CLOSING_TAG = "</{}>"
URL_MARKUP = '<a href="{}">{}</a>'

# Delimiters that open or close a fixed-width (code / pre) span.
FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]

class Markdown:
    """Markdown-style markup support implemented on top of the HTML parser.

    ``parse`` rewrites the delimiters into HTML tags and hands the result to
    the wrapped ``HTML`` parser; ``unparse`` inserts delimiters back around
    the given entities.
    """

    def __init__(self, client: Optional["hydrogram.Client"]):
        """Create the parser.

        Parameters:
            client: Forwarded unchanged to the ``HTML`` parser that ``parse``
                delegates to.
        """
        self.html = HTML(client)

    async def parse(self, text: str, strict: bool = False):
        """Translate the Markdown-style markup in *text* to HTML and parse it.

        Parameters:
            text: The text to parse.
            strict: When true, *text* is passed through ``html.escape``
                before any delimiter is translated.

        Returns:
            The result of ``HTML.parse`` applied to the translated text.
        """
        if strict:
            text = html.escape(text)

        tag_names = {
            BOLD_DELIM: "b",
            ITALIC_DELIM: "i",
            UNDERLINE_DELIM: "u",
            STRIKE_DELIM: "s",
            CODE_DELIM: "code",
            PRE_DELIM: "pre",
            SPOILER_DELIM: "spoiler",
        }

        open_delims = set()
        is_fixed_width = False

        # The iterator below scans the text as it is right now, but every
        # replacement lengthens ``text``. ``shift`` is the total growth so
        # far; adding it to a match offset gives the position of that match
        # in the rewritten text. Without it, a replacement could land on an
        # earlier delimiter that was deliberately left untouched inside a
        # code span.
        shift = 0

        for match in MARKDOWN_RE.finditer(text):
            delim, text_url, url = match.groups()

            if delim in FIXED_WIDTH_DELIMS:
                is_fixed_width = not is_fixed_width

            # Inside a code/pre span everything except the fixed-width
            # delimiters themselves is kept verbatim.
            if is_fixed_width and delim not in FIXED_WIDTH_DELIMS:
                continue

            position = match.start() + shift

            if text_url:
                old = match.group(0)
                new = URL_MARKUP.format(url, text_url)
            else:
                tag = tag_names.get(delim)
                if tag is None:
                    continue

                if delim in open_delims:
                    open_delims.remove(delim)
                    old, new = delim, CLOSING_TAG.format(tag)
                elif delim == PRE_DELIM:
                    open_delims.add(delim)
                    # An opening fence may carry a language name up to the
                    # end of its line. Search from the current match rather
                    # than from the start of the text so an unrelated fence
                    # (e.g. inside an already emitted link) is never picked.
                    fence_at = text.find(PRE_DELIM, position)
                    if fence_at < 0:
                        continue
                    old = text[fence_at:].split("\n")[0]
                    language = old[len(PRE_DELIM) :]
                    new = f'<pre language="{language}">'
                else:
                    open_delims.add(delim)
                    old, new = delim, OPENING_TAG.format(tag)

            # NOTE(review): utils.replace_once is assumed to replace the first
            # occurrence of ``old`` located at or after ``position`` - confirm
            # against its definition.
            length_before = len(text)
            text = utils.replace_once(text, old, new, position)
            # Measure the growth instead of computing it, so a call that
            # replaced nothing does not skew later positions.
            shift += len(text) - length_before

        return await self.html.parse(text)

    @staticmethod
    def unparse(text: str, entities: list):
        """Insert Markdown-style delimiters around *entities* in *text*.

        Parameters:
            text: The plain message text.
            entities: Objects exposing ``type``, ``offset`` and ``length``
                (plus ``language``, ``url`` or ``user`` for the entity types
                that read them). Entities of any other type are ignored.

        Returns:
            The text with delimiters inserted, after
            ``utils.remove_surrogates``.
        """
        text = utils.add_surrogates(text)

        # Entity types whose opening and closing markers are the same string.
        symmetric_delims = {
            MessageEntityType.BOLD: BOLD_DELIM,
            MessageEntityType.ITALIC: ITALIC_DELIM,
            MessageEntityType.UNDERLINE: UNDERLINE_DELIM,
            MessageEntityType.STRIKETHROUGH: STRIKE_DELIM,
            MessageEntityType.CODE: CODE_DELIM,
            MessageEntityType.BLOCKQUOTE: PRE_DELIM,
            MessageEntityType.SPOILER: SPOILER_DELIM,
        }

        insertions = []

        for entity in entities:
            entity_type = entity.type
            start = entity.offset
            end = start + entity.length

            if entity_type == MessageEntityType.PRE:
                language = getattr(entity, "language", "") or ""
                start_tag = f"{PRE_DELIM}{language}\n"
                end_tag = f"\n{PRE_DELIM}"
            elif entity_type == MessageEntityType.TEXT_LINK:
                url = entity.url
                start_tag = "["
                end_tag = f"]({url})"
            elif entity_type == MessageEntityType.TEXT_MENTION:
                user = entity.user
                start_tag = "["
                end_tag = f"](tg://user?id={user.id})"
            else:
                marker = symmetric_delims.get(entity_type)
                if marker is None:
                    continue
                start_tag = end_tag = marker

            insertions.append((start_tag, start))
            insertions.append((end_tag, end))

        # Insert from the highest offset down so earlier offsets stay valid;
        # ties are broken by reverse order of appearance.
        ordered = sorted(
            enumerate(insertions),
            key=lambda item: (item[1][1], item[0]),
            reverse=True,
        )

        for _, (marker, offset) in ordered:
            text = text[:offset] + marker + text[offset:]

        return utils.remove_surrogates(text)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

FilesExpand file tree

markdown.py

Latest commit

History

markdown.py

File metadata and controls