197 lines
		
	
	
		
			6.9 KiB
		
	
	
	
		
			Python
		
	
	
			
		
		
	
	
			197 lines
		
	
	
		
			6.9 KiB
		
	
	
	
		
			Python
		
	
	
| # Copyright 2021 The Matrix.org Foundation C.I.C.
 | |
| #
 | |
| # Licensed under the Apache License, Version 2.0 (the "License");
 | |
| # you may not use this file except in compliance with the License.
 | |
| # You may obtain a copy of the License at
 | |
| #
 | |
| #     http://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| # Unless required by applicable law or agreed to in writing, software
 | |
| # distributed under the License is distributed on an "AS IS" BASIS,
 | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| # See the License for the specific language governing permissions and
 | |
| # limitations under the License.
 | |
| import json
 | |
| import re
 | |
| from typing import Any, Dict, Iterable, List, Optional, Pattern
 | |
| from urllib import parse as urlparse
 | |
| 
 | |
| import attr
 | |
| import pkg_resources
 | |
| 
 | |
| from synapse.types import JsonDict
 | |
| 
 | |
| from ._base import Config, ConfigError
 | |
| from ._util import validate_config
 | |
| 
 | |
| 
 | |
@attr.s(slots=True, frozen=True, auto_attribs=True)
class OEmbedEndpointConfig:
    """A single oEmbed API endpoint and the URL patterns it serves."""

    # The API endpoint to fetch.
    api_endpoint: str
    # The compiled patterns; a URL matching one of them is served by
    # this endpoint.
    url_patterns: List[Pattern[str]]
    # The supported formats, or None if the provider entry did not list any.
    formats: Optional[List[str]]
 | |
| 
 | |
| 
 | |
class OembedConfig(Config):
    """oEmbed Configuration"""

    section = "oembed"

    def read_config(self, config, **kwargs):
        """Read the `oembed` section of the config and build the endpoint list.

        Sets `self.oembed_patterns` to the list of OEmbedEndpointConfig objects
        parsed from the default and/or additional provider files.
        """
        # A missing or empty section is treated as an empty mapping.
        oembed_config: Dict[str, Any] = config.get("oembed") or {}

        # A list of patterns which will be used.
        self.oembed_patterns: List[OEmbedEndpointConfig] = list(
            self._parse_and_validate_providers(oembed_config)
        )

    def _parse_and_validate_providers(
        self, oembed_config: dict
    ) -> Iterable[OEmbedEndpointConfig]:
        """Extract and parse the oEmbed providers from the configured JSON files.

        Args:
            oembed_config: The `oembed` section of the configuration.

        Returns:
            A generator which yields the OEmbedEndpointConfig objects.

        Raises:
            ConfigError: if an additional providers file cannot be read or
                parsed, or if an endpoint or pattern uses an unsupported
                scheme.
        """
        # Whether to use the packaged providers.json file.
        if not oembed_config.get("disable_default_providers"):
            # Close the resource stream once the JSON has been loaded rather
            # than leaving it to the garbage collector.
            with pkg_resources.resource_stream(
                "synapse", "res/providers.json"
            ) as resource:
                providers = json.load(resource)

            yield from self._parse_and_validate_provider(
                providers, config_path=("oembed",)
            )

        # The JSON files which include additional provider information.
        for i, path in enumerate(oembed_config.get("additional_providers") or []):
            config_path = ("oembed", "additional_providers", f"<item {i}>")

            # Report unreadable or malformed files as configuration errors,
            # pointing at the offending config entry.
            try:
                with open(path, encoding="utf-8") as f:
                    providers = json.load(f)
            except (OSError, UnicodeDecodeError, json.JSONDecodeError) as e:
                raise ConfigError(
                    f"Unable to load oEmbed providers from {path}: {e}",
                    config_path,
                ) from e

            yield from self._parse_and_validate_provider(
                providers, config_path=config_path
            )

    def _parse_and_validate_provider(
        self, providers: List[JsonDict], config_path: Iterable[str]
    ) -> Iterable[OEmbedEndpointConfig]:
        """Validate a parsed providers file and yield its endpoints.

        Args:
            providers: The parsed contents of a providers JSON file.
            config_path: Where this data came from in the config, used in
                error reporting.

        Returns:
            A generator yielding one OEmbedEndpointConfig per endpoint of
            each provider.

        Raises:
            ConfigError: if an endpoint URL or one of its patterns does not
                use the http or https scheme.
        """
        # Ensure it is the proper form.
        validate_config(
            _OEMBED_PROVIDER_SCHEMA,
            providers,
            config_path=config_path,
        )

        # Parse it and yield each result.
        for provider in providers:
            # Each provider might have multiple API endpoints, each which
            # might have multiple patterns to match.
            for endpoint in provider["endpoints"]:
                api_endpoint = endpoint["url"]

                # The API endpoint must be an HTTP(S) URL.
                results = urlparse.urlparse(api_endpoint)
                if results.scheme not in {"http", "https"}:
                    raise ConfigError(
                        f"Unsupported oEmbed scheme ({results.scheme}) for endpoint {api_endpoint}",
                        config_path,
                    )

                patterns = [
                    self._glob_to_pattern(glob, config_path)
                    for glob in endpoint["schemes"]
                ]
                yield OEmbedEndpointConfig(
                    api_endpoint, patterns, endpoint.get("formats")
                )

    def _glob_to_pattern(self, glob: str, config_path: Iterable[str]) -> Pattern:
        """
        Convert the glob into a sane regular expression to match against. The
        rules followed will be slightly different for the domain portion vs.
        the rest.

        1. The scheme must be one of HTTP / HTTPS (and have no globs).
        2. The domain can have globs, but we limit it to characters that can
           reasonably be a domain part.
           TODO: This does not attempt to handle Unicode domain names.
           TODO: The domain should not allow wildcard TLDs.
        3. Other parts allow a glob to be any one, or more, characters.

        Args:
            glob: The URL glob, where `*` is the wildcard.
            config_path: Where the glob came from in the config, used in
                error reporting.

        Returns:
            The compiled regular expression.

        Raises:
            ConfigError: if the glob's scheme is not http or https.
        """
        results = urlparse.urlparse(glob)

        # The scheme must be HTTP(S) (and cannot contain wildcards).
        if results.scheme not in {"http", "https"}:
            raise ConfigError(
                f"Unsupported oEmbed scheme ({results.scheme}) for pattern: {glob}",
                config_path,
            )

        # Escape each URL component so it is matched literally, then turn the
        # (now escaped) `*` wildcards back into regular expression fragments.
        pattern = urlparse.urlunparse(
            [
                results.scheme,
                re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"),
            ]
            + [re.escape(part).replace("\\*", ".+") for part in results[2:]]
        )
        return re.compile(pattern)

    def generate_config_section(self, **kwargs):
        """Return the sample `oembed` section for the generated config file."""
        return """\
        # oEmbed allows for easier embedding content from a website. It can be
        # used for generating URLs previews of services which support it.
        #
        oembed:
          # A default list of oEmbed providers is included with Synapse.
          #
          # Uncomment the following to disable using these default oEmbed URLs.
          # Defaults to 'false'.
          #
          #disable_default_providers: true

          # Additional files with oEmbed configuration (each should be in the
          # form of providers.json).
          #
          # By default, this list is empty (so only the default providers.json
          # is used).
          #
          #additional_providers:
          #  - oembed/my_providers.json
        """
 | |
| 
 | |
| 
 | |
| _OEMBED_PROVIDER_SCHEMA = {
 | |
|     "type": "array",
 | |
|     "items": {
 | |
|         "type": "object",
 | |
|         "properties": {
 | |
|             "provider_name": {"type": "string"},
 | |
|             "provider_url": {"type": "string"},
 | |
|             "endpoints": {
 | |
|                 "type": "array",
 | |
|                 "items": {
 | |
|                     "type": "object",
 | |
|                     "properties": {
 | |
|                         "schemes": {
 | |
|                             "type": "array",
 | |
|                             "items": {"type": "string"},
 | |
|                         },
 | |
|                         "url": {"type": "string"},
 | |
|                         "formats": {"type": "array", "items": {"type": "string"}},
 | |
|                         "discovery": {"type": "boolean"},
 | |
|                     },
 | |
|                     "required": ["schemes", "url"],
 | |
|                 },
 | |
|             },
 | |
|         },
 | |
|         "required": ["provider_name", "provider_url", "endpoints"],
 | |
|     },
 | |
| }
 |