Skip to content

Commit

Permalink
Migrate to OpenSearch (#16180)
Browse files Browse the repository at this point in the history
* use opensearch container locally, to match production index

* migrate to opensearchpy

* rename os -> opensearch, no need to save keystrokes

* Rename wrapper/page

* rename env var

* update docs

* remove refs to dead deps
  • Loading branch information
ewdurbin authored Jun 27, 2024
1 parent bdd7f89 commit 5f9fb17
Show file tree
Hide file tree
Showing 26 changed files with 220 additions and 212 deletions.
2 changes: 0 additions & 2 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ updates:
- dependency-type: indirect
rebase-strategy: "disabled"
ignore:
# Always ignore elasticsearch, future versions are always incompatible with our provider
- dependency-name: "elasticsearch"
# These update basically every day, and 99.9% of the time we don't care
- dependency-name: "boto3"
- dependency-name: "boto3-stubs"
Expand Down
7 changes: 7 additions & 0 deletions dev/compose/opensearch/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
FROM opensearchproject/opensearch:2.12.0

# Strip plugins that the local development index does not need (ML/search
# pipelines, security analytics, and the performance analyzer agent) to cut
# container memory/CPU use. A single chained RUN keeps this to one image
# layer instead of five.
RUN opensearch-plugin remove opensearch-skills --purge && \
    opensearch-plugin remove opensearch-ml --purge && \
    opensearch-plugin remove opensearch-neural-search --purge && \
    opensearch-plugin remove opensearch-performance-analyzer --purge && \
    opensearch-plugin remove opensearch-security-analytics --purge
2 changes: 1 addition & 1 deletion dev/environment
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ BROKER_URL=sqs://localstack:4566/?region=us-east-1&queue_name_prefix=warehouse-d

DATABASE_URL=postgresql+psycopg://postgres@db/warehouse

ELASTICSEARCH_URL=http://elasticsearch:9200/development
OPENSEARCH_URL=http://opensearch:9200/development

REDIS_URL=redis://redis:6379/0

Expand Down
2 changes: 1 addition & 1 deletion docker-compose.override.yaml-sample
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ services:
dev-docs: *disable-service
user-docs: *disable-service

elasticsearch:
opensearch:
# You can also add selective environment variables
environment:
logger.level: WARN # default INFO is pretty noisy
Expand Down
17 changes: 13 additions & 4 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,20 @@ services:
ports:
- "4566:4566"

elasticsearch:
image: elasticsearch:7.10.1
opensearch:
build:
context: ./dev/compose/opensearch
init: true
healthcheck:
test: ["CMD-SHELL", "curl -u admin:gqYeDIzbEwTTYmB7 --silent --fail http://localhost:9200/_cluster/health || exit 1"]
interval: 1s
start_period: 10s
environment:
- xpack.security.enabled=false
- discovery.type=single-node
- OPENSEARCH_INITIAL_ADMIN_PASSWORD=gqYeDIzbEwTTYmB7
- DISABLE_INSTALL_DEMO_CONFIG=true
- DISABLE_SECURITY_PLUGIN=true
- DISABLE_PERFORMANCE_ANALYZER_AGENT_CLI=true
ulimits:
nofile:
soft: 65536
Expand Down Expand Up @@ -109,7 +118,7 @@ services:
depends_on:
db:
condition: service_healthy
elasticsearch:
opensearch:
condition: service_started
redis:
condition: service_started
Expand Down
8 changes: 4 additions & 4 deletions docs/dev/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,12 @@ C4Container
Container(camo, "Camo", "image proxy")
Container(web_app, "Web", "Python (Pyramid, SQLAlchemy)", "Delivers HTML and API content")
SystemQueue(sqs, "AWS SQS", "task broker")
SystemDb(elasticsearch, "Elasticsearch", "Index of projects, packages, metadata")
SystemDb(opensearch, "OpenSearch", "Index of projects, packages, metadata")
SystemDb(db, "Postgres Database", "Store project, package metadata, user details")
SystemDb(redis, "Redis", "Store short-term cache data")
Rel(web_app, sqs, "queue tasks")
Rel(web_app, elasticsearch, "search for projects")
Rel(web_app, opensearch, "search for projects")
Rel(web_app, db, "store/retrieve most data")
Rel(web_app, redis, "cache data")
}
Expand Down Expand Up @@ -153,7 +153,7 @@ C4Container
Container_Boundary(c1, "Supporting Systems") {
SystemDb(redis, "Redis", "Store short-term cache data")
SystemQueue(sqs, "AWS SQS", "task broker")
SystemDb(elasticsearch, "Elasticsearch", "Index of projects, packages, metadata")
SystemDb(opensearch, "OpenSearch", "Index of projects, packages, metadata")
SystemDb(db, "Postgres Database", "Store project, package metadata, user details")
System(ses, "AWS SES", "Simple Email Service")
}
Expand All @@ -163,7 +163,7 @@ C4Container
BiRel(worker, sqs, "get next task/ack")
BiRel(worker, redis, "store task results")
BiRel(worker, db, "interact with models")
BiRel(worker, elasticsearch, "update search index")
BiRel(worker, opensearch, "update search index")
Rel(worker, fastly, "purge URLs")
Rel(worker, ses, "send emails")
Expand Down
20 changes: 10 additions & 10 deletions docs/dev/development/getting-started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ application.
(on Windows by editing the config file found at ``C:\Users\<USER>\AppData\Local\Docker\wsl``).

If you are using Linux, you may need to configure the maximum map count to get
the `elasticsearch` up and running. According to the
`documentation <https://www.elastic.co/guide/en/elasticsearch/reference/6.8/vm-max-map-count.html>`_
the `opensearch` container up and running. According to the
`documentation <https://opensearch.org/docs/2.15/install-and-configure/install-opensearch/index/#important-settings>`_
this can be set temporarily:

.. code-block:: console
Expand All @@ -200,9 +200,9 @@ application.
:file:`/etc/sysctl.conf`.

Also check that you have more than 5% disk space free, otherwise
elasticsearch will become read only. See ``flood_stage`` in the
`elasticsearch disk allocation docs
<https://www.elastic.co/guide/en/elasticsearch/reference/6.8/disk-allocator.html>`_.
opensearch will become read only. See ``flood_stage`` in the
`opensearch disk allocation docs
<https://opensearch.org/docs/latest/install-and-configure/configuring-opensearch/cluster-settings/#cluster-level-routing-and-allocation-settings>`_.


Once ``make build`` has finished, run the command:
Expand Down Expand Up @@ -414,10 +414,10 @@ Errors when executing ``make initdb``

* If ``make initdb`` fails with a timeout like::

urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPConnection object at 0x8beca733c3c8>, 'Connection to elasticsearch timed out. (connect timeout=30)')
urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPConnection object at 0x8beca733c3c8>, 'Connection to opensearch timed out. (connect timeout=30)')

you might need to increase the amount of memory allocated to docker, since
elasticsearch wants a lot of memory (Dustin gives warehouse ~4GB locally).
opensearch wants a lot of memory (Dustin gives warehouse ~4GB locally).
Refer to the tip under :ref:`running-warehouse-containers` section for more details.


Expand Down Expand Up @@ -478,7 +478,7 @@ Docker please raise an issue in
Disabling services locally
^^^^^^^^^^^^^^^^^^^^^^^^^^

Some services, such as Elasticsearch, consume a lot of resources when running
Some services, such as OpenSearch, consume a lot of resources when running
locally, but might not always be necessary when doing local development.

To disable these locally, you can create a ``docker-compose.override.yaml``
Expand All @@ -490,8 +490,8 @@ individually disable services, modify their entrypoint to do something else:
version: "3"
services:
elasticsearch:
entrypoint: ["echo", "Elasticsearch disabled"]
opensearch:
entrypoint: ["echo", "OpenSearch disabled"]
Note that disabling services might cause things to fail in unexpected ways.

Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ module = [
"b2sdk.*", # https://github.com/Backblaze/b2-sdk-python/issues/148
"celery.app.backends.*",
"celery.backends.redis.*",
"elasticsearch_dsl.*", # https://github.com/elastic/elasticsearch-dsl-py/issues/1533
"github_reserved_names.*",
"google.cloud.*",
"forcediphttpsadapter.*",
Expand Down
5 changes: 2 additions & 3 deletions requirements/main.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ click
cryptography
datadog>=0.19.0
disposable-email-domains
elasticsearch>=7.0.0,<7.11.0
elasticsearch_dsl>=7.0.0,<8.0.0
first
forcediphttpsadapter
github-reserved-names>=1.0.0
Expand All @@ -31,6 +29,7 @@ linehaul
lxml
msgpack
natsort
opensearch-py
orjson
packaging>=23.2
packaging_legacy
Expand Down Expand Up @@ -69,7 +68,7 @@ structlog
transaction
trove-classifiers
ua-parser
urllib3<2 # See https://github.com/pypi/warehouse/issues/14671
urllib3
webauthn>=1.0.0,<3.0.0
whitenoise
WTForms[email]>=2.0.0
Expand Down
26 changes: 12 additions & 14 deletions requirements/main.txt
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ certifi==2024.6.2 \
--hash=sha256:ddc6c8ce995e6987e7faf5e3f1b02b302836a0e5d98ece18392cb1a36c72ad56
# via
# -r requirements/main.in
# elasticsearch
# opensearch-py
# requests
# sentry-sdk
cffi==1.16.0 \
Expand Down Expand Up @@ -485,20 +485,13 @@ docutils==0.20.1 \
--hash=sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6 \
--hash=sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b
# via readme-renderer
elasticsearch==7.10.1 \
--hash=sha256:4ebd34fd223b31c99d9f3b6b6236d3ac18b3046191a37231e8235b06ae7db955 \
--hash=sha256:a725dd923d349ca0652cf95d6ce23d952e2153740cf4ab6daf4a2d804feeed48
# via
# -r requirements/main.in
# elasticsearch-dsl
elasticsearch-dsl==7.4.1 \
--hash=sha256:07ee9c87dc28cc3cae2daa19401e1e18a172174ad9e5ca67938f752e3902a1d5 \
--hash=sha256:97f79239a252be7c4cce554c29e64695d7ef6a4828372316a5e5ff815e7a7498
# via -r requirements/main.in
email-validator==2.2.0 \
--hash=sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631 \
--hash=sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7
# via wtforms
events==0.5 \
--hash=sha256:a7286af378ba3e46640ac9825156c93bdba7502174dd696090fdfcd4d80a1abd
# via opensearch-py
first==2.0.2 \
--hash=sha256:8d8e46e115ea8ac652c76123c0865e3ff18372aef6f03c22809ceefcea9dec86 \
--hash=sha256:ff285b08c55f8c97ce4ea7012743af2495c9f1291785f163722bd36f6af6d3bf
Expand Down Expand Up @@ -1285,6 +1278,10 @@ openapi-spec-validator==0.7.1 \
--hash=sha256:3c81825043f24ccbcd2f4b149b11e8231abce5ba84f37065e14ec947d8f4e959 \
--hash=sha256:8577b85a8268685da6f8aa30990b83b7960d4d1117e901d451b5d572605e5ec7
# via openapi-core
opensearch-py==2.6.0 \
--hash=sha256:0b7c27e8ed84c03c99558406927b6161f186a72502ca6d0325413d8e5523ba96 \
--hash=sha256:b6e78b685dd4e9c016d7a4299cf1de69e299c88322e3f81c716e6e23fe5683c1
# via -r requirements/main.in
orjson==3.10.5 \
--hash=sha256:03b565c3b93f5d6e001db48b747d31ea3819b89abf041ee10ac6988886d18e01 \
--hash=sha256:099e81a5975237fda3100f918839af95f42f981447ba8f47adb7b6a3cdb078fa \
Expand Down Expand Up @@ -1649,8 +1646,8 @@ python-dateutil==2.9.0.post0 \
# botocore
# celery
# celery-redbeat
# elasticsearch-dsl
# google-cloud-bigquery
# opensearch-py
python-slugify==8.0.4 \
--hash=sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8 \
--hash=sha256:59202371d1d05b54a9e7720c5e038f928f45daaffe41dd10822f3907b937c856
Expand Down Expand Up @@ -1745,6 +1742,7 @@ requests==2.32.3 \
# google-cloud-bigquery
# google-cloud-storage
# jsonschema-path
# opensearch-py
# premailer
# requests-aws4auth
# stripe
Expand Down Expand Up @@ -1880,9 +1878,9 @@ six==1.16.0 \
--hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254
# via
# automat
# elasticsearch-dsl
# html5lib
# isodate
# opensearch-py
# pymacaroons
# python-dateutil
# requests-aws4auth
Expand Down Expand Up @@ -2006,8 +2004,8 @@ urllib3==1.26.19 \
# -r requirements/main.in
# botocore
# celery
# elasticsearch
# kombu
# opensearch-py
# requests
# sentry-sdk
venusian==3.1.0 \
Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def app_config(database):
"database.url": database,
"docs.url": "http://docs.example.com/",
"ratelimit.url": "memory://",
"elasticsearch.url": "https://localhost/warehouse",
"opensearch.url": "https://localhost/warehouse",
"files.backend": "warehouse.packaging.services.LocalFileStorage",
"archive_files.backend": "warehouse.packaging.services.LocalArchiveFileStorage",
"simple.backend": "warehouse.packaging.services.LocalSimpleStorage",
Expand Down
46 changes: 23 additions & 23 deletions tests/unit/search/test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import elasticsearch
import opensearchpy
import pretend

from warehouse import search
Expand Down Expand Up @@ -69,7 +69,7 @@ def test_execute_unindex_success(app_config):
assert "warehouse.search.project_deletes" not in session.info


def test_es(monkeypatch):
def test_opensearch(monkeypatch):
search_obj = pretend.stub()
index_obj = pretend.stub(
document=pretend.call_recorder(lambda d: None),
Expand All @@ -84,15 +84,15 @@ def test_es(monkeypatch):
client = pretend.stub()
request = pretend.stub(
registry={
"elasticsearch.client": client,
"elasticsearch.index": "warehouse",
"opensearch.client": client,
"opensearch.index": "warehouse",
"search.doc_types": doc_types,
}
)

es = search.es(request)
opensearch = search.opensearch(request)

assert es is search_obj
assert opensearch is search_obj
assert index_cls.calls == [pretend.call("warehouse", using=client)]
assert index_obj.document.calls == [pretend.call(d) for d in doc_types]
assert index_obj.settings.calls == [
Expand All @@ -104,20 +104,20 @@ def test_es(monkeypatch):
def test_includeme(monkeypatch):
aws4auth_stub = pretend.stub()
aws4auth = pretend.call_recorder(lambda *a, **kw: aws4auth_stub)
es_client = pretend.stub()
es_client_init = pretend.call_recorder(lambda *a, **kw: es_client)
opensearch_client = pretend.stub()
opensearch_client_init = pretend.call_recorder(lambda *a, **kw: opensearch_client)

monkeypatch.setattr(search.requests_aws4auth, "AWS4Auth", aws4auth)
monkeypatch.setattr(search.elasticsearch, "Elasticsearch", es_client_init)
monkeypatch.setattr(search.opensearchpy, "OpenSearch", opensearch_client_init)

registry = {}
es_url = "https://some.url/some-index?aws_auth=1&region=us-east-2"
opensearch_url = "https://some.url/some-index?aws_auth=1&region=us-east-2"
config = pretend.stub(
registry=pretend.stub(
settings={
"aws.key_id": "AAAAAAAAAAAA",
"aws.secret_key": "deadbeefdeadbeefdeadbeef",
"elasticsearch.url": es_url,
"opensearch.url": opensearch_url,
},
__setitem__=registry.__setitem__,
),
Expand All @@ -130,20 +130,20 @@ def test_includeme(monkeypatch):
assert aws4auth.calls == [
pretend.call("AAAAAAAAAAAA", "deadbeefdeadbeefdeadbeef", "us-east-2", "es")
]
assert len(es_client_init.calls) == 1
assert es_client_init.calls[0].kwargs["hosts"] == ["https://some.url"]
assert es_client_init.calls[0].kwargs["timeout"] == 2
assert es_client_init.calls[0].kwargs["retry_on_timeout"] is False
assert len(opensearch_client_init.calls) == 1
assert opensearch_client_init.calls[0].kwargs["hosts"] == ["https://some.url"]
assert opensearch_client_init.calls[0].kwargs["timeout"] == 2
assert opensearch_client_init.calls[0].kwargs["retry_on_timeout"] is False
assert (
es_client_init.calls[0].kwargs["connection_class"]
== elasticsearch.connection.http_requests.RequestsHttpConnection
opensearch_client_init.calls[0].kwargs["connection_class"]
== opensearchpy.connection.http_requests.RequestsHttpConnection
)
assert es_client_init.calls[0].kwargs["http_auth"] == aws4auth_stub
assert opensearch_client_init.calls[0].kwargs["http_auth"] == aws4auth_stub

assert registry["elasticsearch.client"] == es_client
assert registry["elasticsearch.index"] == "some-index"
assert registry["elasticsearch.shards"] == 1
assert registry["elasticsearch.replicas"] == 0
assert registry["opensearch.client"] == opensearch_client
assert registry["opensearch.index"] == "some-index"
assert registry["opensearch.shards"] == 1
assert registry["opensearch.replicas"] == 0
assert config.add_request_method.calls == [
pretend.call(search.es, name="es", reify=True)
pretend.call(search.opensearch, name="opensearch", reify=True)
]
Loading

0 comments on commit 5f9fb17

Please sign in to comment.