diff --git a/.gitignore b/.gitignore
index 6342047..dd2c92d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -169,4 +169,11 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+
+# Local agent/editor state
+.continue/
+temp/
+.envs/.local/.django
+start-dev.sh
+opencode.json
diff --git a/.pylintrc b/.pylintrc
deleted file mode 100644
index 55509fe..0000000
--- a/.pylintrc
+++ /dev/null
@@ -1,14 +0,0 @@
-[MASTER]
-load-plugins=pylint_django, pylint_celery
-django-settings-module=config.settings.base
-[FORMAT]
-max-line-length=120
-
-[MESSAGES CONTROL]
-disable=missing-docstring,invalid-name
-
-[DESIGN]
-max-parents=13
-
-[TYPECHECK]
-generated-members=REQUEST,acl_users,aq_parent,"[a-zA-Z]+_set{1,2}",save,delete
diff --git a/.readthedocs.yml b/.readthedocs.yml
deleted file mode 100644
index b4cf0c0..0000000
--- a/.readthedocs.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-version: 2
-
-sphinx:
- configuration: docs/conf.py
-
-build:
- image: testing
-
-python:
- version: 3.9
- install:
- - requirements: requirements/local.txt
diff --git a/COPYING b/COPYING
deleted file mode 100644
index 94a9ed0..0000000
--- a/COPYING
+++ /dev/null
@@ -1,674 +0,0 @@
- GNU GENERAL PUBLIC LICENSE
- Version 3, 29 June 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc.
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
- Preamble
-
- The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
- The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works. By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users. We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors. You can apply it to
-your programs, too.
-
- When we speak of free software, we are referring to freedom, not
-price. Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
- To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights. Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
-
- For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received. You must make sure that they, too, receive
-or can get the source code. And you must show them these terms so they
-know their rights.
-
- Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
-
- For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software. For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-
- Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so. This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software. The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable. Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products. If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
- Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary. To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
-
- The precise terms and conditions for copying, distribution and
-modification follow.
-
- TERMS AND CONDITIONS
-
- 0. Definitions.
-
- "This License" refers to version 3 of the GNU General Public License.
-
- "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
- "The Program" refers to any copyrightable work licensed under this
-License. Each licensee is addressed as "you". "Licensees" and
-"recipients" may be individuals or organizations.
-
- To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy. The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
- A "covered work" means either the unmodified Program or a work based
-on the Program.
-
- To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy. Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
- To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies. Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
- An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License. If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
- 1. Source Code.
-
- The "source code" for a work means the preferred form of the work
-for making modifications to it. "Object code" means any non-source
-form of a work.
-
- A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
- The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form. A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
- The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities. However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work. For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
- The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
- The Corresponding Source for a work in source code form is that
-same work.
-
- 2. Basic Permissions.
-
- All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met. This License explicitly affirms your unlimited
-permission to run the unmodified Program. The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work. This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
- You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force. You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright. Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
- Conveying under any other circumstances is permitted solely under
-the conditions stated below. Sublicensing is not allowed; section 10
-makes it unnecessary.
-
- 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
- No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
- When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
- 4. Conveying Verbatim Copies.
-
- You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
- You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
- 5. Conveying Modified Source Versions.
-
- You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
- a) The work must carry prominent notices stating that you modified
- it, and giving a relevant date.
-
- b) The work must carry prominent notices stating that it is
- released under this License and any conditions added under section
- 7. This requirement modifies the requirement in section 4 to
- "keep intact all notices".
-
- c) You must license the entire work, as a whole, under this
- License to anyone who comes into possession of a copy. This
- License will therefore apply, along with any applicable section 7
- additional terms, to the whole of the work, and all its parts,
- regardless of how they are packaged. This License gives no
- permission to license the work in any other way, but it does not
- invalidate such permission if you have separately received it.
-
- d) If the work has interactive user interfaces, each must display
- Appropriate Legal Notices; however, if the Program has interactive
- interfaces that do not display Appropriate Legal Notices, your
- work need not make them do so.
-
- A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit. Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
- 6. Conveying Non-Source Forms.
-
- You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
- a) Convey the object code in, or embodied in, a physical product
- (including a physical distribution medium), accompanied by the
- Corresponding Source fixed on a durable physical medium
- customarily used for software interchange.
-
- b) Convey the object code in, or embodied in, a physical product
- (including a physical distribution medium), accompanied by a
- written offer, valid for at least three years and valid for as
- long as you offer spare parts or customer support for that product
- model, to give anyone who possesses the object code either (1) a
- copy of the Corresponding Source for all the software in the
- product that is covered by this License, on a durable physical
- medium customarily used for software interchange, for a price no
- more than your reasonable cost of physically performing this
- conveying of source, or (2) access to copy the
- Corresponding Source from a network server at no charge.
-
- c) Convey individual copies of the object code with a copy of the
- written offer to provide the Corresponding Source. This
- alternative is allowed only occasionally and noncommercially, and
- only if you received the object code with such an offer, in accord
- with subsection 6b.
-
- d) Convey the object code by offering access from a designated
- place (gratis or for a charge), and offer equivalent access to the
- Corresponding Source in the same way through the same place at no
- further charge. You need not require recipients to copy the
- Corresponding Source along with the object code. If the place to
- copy the object code is a network server, the Corresponding Source
- may be on a different server (operated by you or a third party)
- that supports equivalent copying facilities, provided you maintain
- clear directions next to the object code saying where to find the
- Corresponding Source. Regardless of what server hosts the
- Corresponding Source, you remain obligated to ensure that it is
- available for as long as needed to satisfy these requirements.
-
- e) Convey the object code using peer-to-peer transmission, provided
- you inform other peers where the object code and Corresponding
- Source of the work are being offered to the general public at no
- charge under subsection 6d.
-
- A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
- A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling. In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage. For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product. A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
- "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source. The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
- If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information. But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
- The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed. Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
- Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
- 7. Additional Terms.
-
- "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law. If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
- When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it. (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.) You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
- Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
- a) Disclaiming warranty or limiting liability differently from the
- terms of sections 15 and 16 of this License; or
-
- b) Requiring preservation of specified reasonable legal notices or
- author attributions in that material or in the Appropriate Legal
- Notices displayed by works containing it; or
-
- c) Prohibiting misrepresentation of the origin of that material, or
- requiring that modified versions of such material be marked in
- reasonable ways as different from the original version; or
-
- d) Limiting the use for publicity purposes of names of licensors or
- authors of the material; or
-
- e) Declining to grant rights under trademark law for use of some
- trade names, trademarks, or service marks; or
-
- f) Requiring indemnification of licensors and authors of that
- material by anyone who conveys the material (or modified versions of
- it) with contractual assumptions of liability to the recipient, for
- any liability that these contractual assumptions directly impose on
- those licensors and authors.
-
- All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10. If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term. If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
- If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
- Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
- 8. Termination.
-
- You may not propagate or modify a covered work except as expressly
-provided under this License. Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
- However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
- Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
- Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License. If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
- 9. Acceptance Not Required for Having Copies.
-
- You are not required to accept this License in order to receive or
-run a copy of the Program. Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance. However,
-nothing other than this License grants you permission to propagate or
-modify any covered work. These actions infringe copyright if you do
-not accept this License. Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
- 10. Automatic Licensing of Downstream Recipients.
-
- Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License. You are not responsible
-for enforcing compliance by third parties with this License.
-
- An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations. If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
- You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License. For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
- 11. Patents.
-
- A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based. The
-work thus licensed is called the contributor's "contributor version".
-
- A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version. For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
- Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
- In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement). To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
- If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients. "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
- If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
- A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License. You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
- Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
- 12. No Surrender of Others' Freedom.
-
- If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License. If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all. For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
- 13. Use with the GNU Affero General Public License.
-
- Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work. The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-
- 14. Revised Versions of this License.
-
- The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time. Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
- Each version is given a distinguishing version number. If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation. If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
- If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
- Later license versions may give you additional or different
-permissions. However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
- 15. Disclaimer of Warranty.
-
- THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
- 16. Limitation of Liability.
-
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
- 17. Interpretation of Sections 15 and 16.
-
- If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
- END OF TERMS AND CONDITIONS
-
- How to Apply These Terms to Your New Programs
-
- If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
- To do so, attach the following notices to the program. It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-
- Copyright (C)
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
-
-Also add information on how to contact you by electronic and paper mail.
-
- If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
- Copyright (C)
- This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
- This is free software, and you are welcome to redistribute it
- under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License. Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
- You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-.
-
- The GNU General Public License does not permit incorporating your program
-into proprietary programs. If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library. If this is what you want to do, use the GNU Lesser General
-Public License instead of this License. But first, please read
-.
diff --git a/article/admin.py b/article/admin.py
deleted file mode 100644
index 8c38f3f..0000000
--- a/article/admin.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.contrib import admin
-
-# Register your models here.
diff --git a/article/management/commands/__init__.py b/article/management/commands/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/article/management/commands/load_articles_by_year.py b/article/management/commands/load_articles_by_year.py
deleted file mode 100644
index 335598e..0000000
--- a/article/management/commands/load_articles_by_year.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from django.core.management.base import BaseCommand
-
-from article.tasks import task_load_article_from_opac, task_load_article_from_article_meta
-
-
-class Command(BaseCommand):
- help = 'Generate task requests for loading article data from Article Meta for each year from 1900 to 2025'
-
- def add_arguments(self, parser):
- parser.add_argument(
- '--start-year',
- type=int,
- default=1990,
- help='Start year (default: 1990)'
- )
- parser.add_argument(
- '--end-year',
- type=int,
- default=2025,
- help='End year (default: 2025)'
- )
- parser.add_argument(
- '--collection',
- type=str,
- default='scl',
- help='Collection code (default: scl)'
- )
- parser.add_argument(
- '--task',
- choices=['load_article_from_opac', 'load_article_from_article_meta'],
- default='load_article_from_opac',
- help='Task to execute (default: load_article_from_opac)',
- )
-
- def handle(self, *args, **options):
- start_year = options['start_year']
- end_year = options['end_year']
- collection = options['collection']
-
- self.stdout.write(
- self.style.SUCCESS(
- f'Generating task requests from {start_year} to {end_year} for collection: {collection}'
- )
- )
-
- total_tasks = 0
-
- for year in range(start_year, end_year + 1):
- from_date = f'{year}-01-01'
- until_date = f'{year}-12-31'
-
- self.stdout.write(f'Queuing task for year {year}...')
-
- # Queue the task for each year
- if options['task'] == 'load_article_from_article_meta':
- task_result = task_load_article_from_article_meta.delay(
- from_date=from_date,
- until_date=until_date,
- collection=collection
- )
- else:
- task_result = task_load_article_from_opac.delay(
- from_date=from_date,
- until_date=until_date,
- collection=collection
- )
-
- total_tasks += 1
-
- self.stdout.write(
- self.style.SUCCESS(
- f'✓ Task queued for year {year}: {from_date} to {until_date} (Task ID: {task_result.id})'
- )
- )
-
- self.stdout.write(
- self.style.SUCCESS(
- f'\nCompleted! {total_tasks} tasks have been queued successfully.'
- )
- )
diff --git a/article/migrations/0001_initial.py b/article/migrations/0001_initial.py
deleted file mode 100644
index 816d61e..0000000
--- a/article/migrations/0001_initial.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Generated by Django 5.0.7 on 2025-02-07 17:50
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- initial = True
-
- dependencies = [
- ("collection", "0001_initial"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.CreateModel(
- name="Article",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
- ),
- (
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
- ),
- (
- "scielo_issn",
- models.CharField(
- db_index=True, max_length=9, verbose_name="SciELO ISSN"
- ),
- ),
- (
- "pid_v2",
- models.CharField(
- db_index=True, max_length=23, verbose_name="PID V2"
- ),
- ),
- (
- "pid_v3",
- models.CharField(
- blank=True,
- db_index=True,
- max_length=23,
- null=True,
- verbose_name="PID V3",
- ),
- ),
- (
- "pdfs",
- models.JSONField(
- blank=True,
- default=dict,
- null=True,
- verbose_name="Format with Language",
- ),
- ),
- (
- "default_lang",
- models.CharField(max_length=2, verbose_name="Default Language"),
- ),
- (
- "text_langs",
- models.JSONField(
- blank=True,
- default=dict,
- null=True,
- verbose_name="Text Languages",
- ),
- ),
- (
- "processing_date",
- models.CharField(max_length=32, verbose_name="Processing Date"),
- ),
- (
- "publication_date",
- models.CharField(max_length=32, verbose_name="Publication Date"),
- ),
- (
- "publication_year",
- models.CharField(
- db_index=True, max_length=4, verbose_name="Publication Year"
- ),
- ),
- (
- "collection",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="collection.collection",
- verbose_name="Collection",
- ),
- ),
- (
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
- ),
- ),
- (
- "updated_by",
- models.ForeignKey(
- blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
- ),
- ),
- ],
- options={
- "verbose_name": "Article",
- "verbose_name_plural": "Articles",
- "unique_together": {("collection", "scielo_issn", "pid_v2", "pid_v3")},
- },
- ),
- ]
diff --git a/article/migrations/0002_alter_article_unique_together_article_files_and_more.py b/article/migrations/0002_alter_article_unique_together_article_files_and_more.py
deleted file mode 100644
index cee055c..0000000
--- a/article/migrations/0002_alter_article_unique_together_article_files_and_more.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Generated by Django 5.0.7 on 2025-04-01 01:09
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("article", "0001_initial"),
- ("collection", "0001_initial"),
- ]
-
- operations = [
- migrations.AddField(
- model_name="article",
- name="files",
- field=models.JSONField(
- blank=True, default=dict, null=True, verbose_name="Files"
- ),
- ),
- migrations.AddField(
- model_name="article",
- name="pid_generic",
- field=models.CharField(
- blank=True,
- db_index=True,
- max_length=50,
- null=True,
- verbose_name="PID Generic",
- ),
- ),
- migrations.RemoveField(
- model_name="article",
- name="pdfs",
- ),
- migrations.AlterUniqueTogether(
- name="article",
- unique_together={
- ("collection", "scielo_issn", "pid_v2", "pid_v3", "pid_generic")
- },
- ),
-
- ]
diff --git a/article/migrations/0003_article_collection_scielo_issn_idx.py b/article/migrations/0003_article_collection_scielo_issn_idx.py
deleted file mode 100644
index 753ac98..0000000
--- a/article/migrations/0003_article_collection_scielo_issn_idx.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-12 17:16
-
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("article", "0002_alter_article_unique_together_article_files_and_more"),
- ("collection", "0001_initial"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.AddIndex(
- model_name="article",
- index=models.Index(
- fields=["collection", "scielo_issn"], name="collection_scielo_issn_idx"
- ),
- ),
- ]
diff --git a/article/migrations/__init__.py b/article/migrations/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/article/models.py b/article/models.py
deleted file mode 100644
index 80d2a97..0000000
--- a/article/models.py
+++ /dev/null
@@ -1,143 +0,0 @@
-from django.db import models
-from django.utils.translation import gettext_lazy as _
-
-from core.models import CommonControlField
-from collection.models import Collection
-
-
-class Article(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.CASCADE,
- blank=False,
- null=False,
- db_index=True,
- )
-
- scielo_issn = models.CharField(
- verbose_name=_('SciELO ISSN'),
- max_length=9,
- blank=False,
- null=False,
- db_index=True,
- )
-
- pid_v2 = models.CharField(
- verbose_name=_('PID V2'),
- max_length=23,
- blank=False,
- null=False,
- db_index=True,
- )
-
- pid_v3 = models.CharField(
- verbose_name=_('PID V3'),
- max_length=23,
- blank=True,
- null=True,
- db_index=True,
- )
-
- pid_generic = models.CharField(
- verbose_name=_('PID Generic'),
- max_length=50,
- blank=True,
- null=True,
- db_index=True,
- )
-
- files = models.JSONField(
- verbose_name=_('Files'),
- null=True,
- blank=True,
- default=dict,
- )
-
- default_lang = models.CharField(
- verbose_name=_('Default Language'),
- max_length=2,
- blank=False,
- null=False,
- )
-
- text_langs = models.JSONField(
- verbose_name=_('Text Languages'),
- null=True,
- blank=True,
- default=dict,
- )
-
- processing_date = models.CharField(
- verbose_name=_('Processing Date'),
- max_length=32,
- null=False,
- blank=False,
- )
-
- publication_date = models.CharField(
- verbose_name=_('Publication Date'),
- max_length=32,
- null=False,
- blank=False,
- )
-
- publication_year = models.CharField(
- verbose_name=_('Publication Year'),
- max_length=4,
- null=False,
- blank=False,
- db_index=True,
- )
-
- def __str__(self):
- return f'{self.collection.acron3} - {self.scielo_issn} - {self.pid_v2 or self.pid_v3 or self.pid_generic}'
-
- @classmethod
- def metadata(cls, collection=None):
- qs = cls.objects.select_related('collection').only(
- 'collection__acron3',
- 'default_lang',
- 'files',
- 'pid_v2',
- 'pid_v3',
- 'pid_generic',
- 'processing_date',
- 'publication_date',
- 'publication_year',
- 'scielo_issn',
- 'text_langs',
- )
-
- if collection:
- qs = qs.filter(collection=collection)
-
- for a in qs.iterator():
- yield {
- 'collection': a.collection.acron3,
- 'default_lang': a.default_lang,
- 'files': a.files,
- 'pid_v2': a.pid_v2,
- 'pid_v3': a.pid_v3,
- 'pid_generic': a.pid_generic,
- 'processing_date': a.processing_date,
- 'publication_date': a.publication_date,
- 'publication_year': a.publication_year,
- 'scielo_issn': a.scielo_issn,
- 'text_langs': a.text_langs,
- }
-
- class Meta:
- verbose_name = _('Article')
- verbose_name_plural = _('Articles')
- unique_together = (
- 'collection',
- 'scielo_issn',
- 'pid_v2',
- 'pid_v3',
- 'pid_generic',
- )
- indexes = [
- models.Index(fields=['collection', 'scielo_issn'], name='collection_scielo_issn_idx'),
- ]
-
diff --git a/article/tasks.py b/article/tasks.py
deleted file mode 100644
index 3514fca..0000000
--- a/article/tasks.py
+++ /dev/null
@@ -1,259 +0,0 @@
-import logging
-
-from django.contrib.auth import get_user_model
-from django.db.models import Q
-from django.db import DataError
-from django.utils.translation import gettext as _
-
-from collection.models import Collection
-from config import celery_app
-from core.utils import date_utils
-from core.utils.utils import _get_user
-
-from journal.models import Journal
-
-from tracker.models import ArticleEvent
-from tracker.choices import ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, ARTICLE_EVENT_TYPE_DATA_ERROR
-
-from . import models, utils
-
-
-User = get_user_model()
-
-@celery_app.task(bind=True, name=_('Load article data from Article Meta'), timelimit=-1, queue='load')
-def task_load_article_from_article_meta(self, from_date=None, until_date=None, days_to_go_back=None, collection=None, issn=None, force_update=True, user_id=None, username=None):
- user = _get_user(self.request, username=username, user_id=user_id)
-
- from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
- logging.info(f'Loading articles from Article Meta. From: {from_date}, Until: {until_date}, Collection: {collection}, ISSN: {issn}.')
-
- offset = 0
- limit = 1000
- while True:
- logging.info(f'{from_date}, {until_date}, {offset}, {limit}, {collection}, {issn}')
- response = utils.fetch_article_meta_dict(from_date, until_date, offset=offset, limit=limit, collection=collection, issn=issn)
- objects = response.get('objects')
- if not objects:
- break
-
- for obj in objects:
- codes = obj.get('code_title')
-
- for issn_code in codes:
- jou = Journal.objects.filter(
- Q(issns__electronic_issn=issn_code) |
- Q(issns__scielo_issn=issn_code) |
- Q(issns__print_issn=issn_code)
- ).first()
- if not jou:
- continue
-
- if not jou:
- logging.info(f'Journal not found for ISSNs: {codes}')
- continue
-
- col_obj = Collection.objects.get(acron3=obj.get('collection'))
- if not col_obj:
- logging.info(f'Collection not found: {obj.get("collection")}')
- continue
-
- try:
- article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=jou.scielo_issn, pid_v2=obj.get('code'))
- if created or force_update:
- article.files = obj.get('pdfs') or {}
- article.processing_date = obj.get('processing_date') or ''
- article.publication_date = obj.get('publication_date') or ''
- article.publication_year = obj.get('publication_year') or ''
- article.default_lang = obj.get('default_language') or ''
- article.text_langs = obj.get('text_langs') or ''
-
- article.save()
- logging.info(f'Article {"created" if created else "updated"}: {article}')
- except models.Article.MultipleObjectsReturned as e:
- logging.error(f'Error getting Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED,
- message=f'Error getting Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}',
- data=obj
- )
- continue
- except DataError as e:
- logging.error(f'Error saving Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_DATA_ERROR,
- message=f'Error saving Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}',
- data=obj
- )
- continue
-
- offset += limit
-
- return True
-
-
-@celery_app.task(bind=True, name=_('Load article data from OPAC'), timelimit=-1, queue='load')
-def task_load_article_from_opac(self, collection='scl', from_date=None, until_date=None, days_to_go_back=None, page=1, force_update=True, user_id=None, username=None):
- user = _get_user(self.request, username=username, user_id=user_id)
-
- from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
- logging.info(f'Loading articles from OPAC. From: {from_date}, Until: {until_date}')
-
- while True:
- response = utils.fetch_opac_dict(from_date, until_date, page=page)
-
- documents = response.get('documents')
-
- for doc_id, doc in documents.items():
- col_obj = Collection.objects.get(acron3=collection)
- if not col_obj:
- logging.error(f'Collection not found: {collection}')
- continue
-
- journal = Journal.objects.get(collection=col_obj, acronym=doc.get('journal_acronym'))
- if not journal:
- logging.error(f'Journal not found: {doc.get("journal_acronym")}')
- continue
-
- try:
- article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=journal.scielo_issn, pid_v2=doc.get('pid_v2'))
-
- if created or force_update:
- article.pid_v3 = doc.get('pid_v3') or ''
- if not created:
- article.pid_v2 = doc.get('pid_v2') or ''
- article.publication_date = doc.get('publication_date') or article.publication_date or ''
- article.default_lang = doc.get('default_language') or article.default_lang or ''
-
- try:
- article.publication_year = article.publication_date[:4]
- except IndexError:
- article.publication_year = ''
-
- article.save()
- logging.info(f'Article {"created" if created else "updated"}: {article}')
-
- except models.Article.MultipleObjectsReturned as e:
- logging.error(f'Error getting Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED,
- message=f'Error creating Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}',
- data=doc
- )
- continue
- except DataError as e:
- logging.error(f'Error saving Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_DATA_ERROR,
- message=f'Error saving Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}',
- data=doc
- )
- continue
-
- page += 1
- if page > int(response.get('pages', 0)):
- break
-
- return True
-
-
-@celery_app.task(bind=True, name=_('Load preprint data from SciELO Preprints'), timelimit=-1, queue='load')
-def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None):
- user = _get_user(self.request, username=username, user_id=user_id)
-
- from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
- logging.info(f'Loading preprints from SciELO Preprints. From: {from_date}, Until: {until_date}')
-
- col_obj = Collection.objects.get(acron3='preprints')
- if not col_obj:
- logging.error(f'Collection not found: preprints')
- return False
-
- for record in utils.fetch_preprint_oai_pmh(from_date, until_date):
- data = utils.extract_preprint_data(record)
-
- if not data.get('pid_generic'):
- logging.error(f'Preprint ID not found in record: {record}')
- continue
-
- try:
- article, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=data['pid_generic'])
- if created or force_update:
- article.text_langs = data.get('text_langs')
- article.default_lang = data.get('default_language')
- article.publication_date = data.get('publication_date')
- article.publication_year = data.get('publication_year')
-
- # Preprints do not have a scielo_issn yet
- article.scielo_issn = '0000-0000'
-
- article.save()
- logging.debug(f'Article {"created" if created else "updated"}: {article}')
- except models.Article.MultipleObjectsReturned as e:
- logging.error(f'Error creating Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED,
- message=f'Error creating Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}',
- data=data
- )
- continue
- except DataError as e:
- logging.error(f'Error saving Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_DATA_ERROR,
- message=f'Error saving Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}',
- data=data
- )
- continue
-
-
-@celery_app.task(bind=True, name=_('Load dataset metadata from Dataverse'), timelimit=-1, queue='load')
-def task_load_dataset_metadata_from_dataverse(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None):
- user = _get_user(self.request, username=username, user_id=user_id)
-
- from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
- logging.info(f'Loading dataset metadata from SciELO Data. From: {from_date}, Until: {until_date}')
-
- col_obj = Collection.objects.get(acron3='data')
- if not col_obj:
- logging.error(f'Collection not found: data')
- return False
-
- for record in utils.fetch_dataverse_metadata(from_date, until_date):
- dataset_doi = record.get('dataset_doi')
- if not dataset_doi:
- logging.error(f'Dataset DOI not found in record: {record}')
- continue
-
- try:
- dataset, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=dataset_doi)
- if created or force_update:
- dataset.publication_date = record.get('dataset_published')
-
- file_persistent_id = record.get('file_persistent_id')
- file_id = record.get('file_id')
- file_name = record.get('file_name')
- file_url = record.get('file_url')
-
- if file_id:
- dataset.files[file_id] = {'name': file_name, 'url': file_url, 'file_persisent_id': file_persistent_id}
-
- dataset.save()
- logging.debug(f'Dataset {"created" if created else "updated"}: {dataset}')
- except models.Article.MultipleObjectsReturned as e:
- logging.error(f'Error creating Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED,
- message=f'Error creating Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}',
- data=record
- )
- continue
- except DataError as e:
- logging.error(f'Error saving Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_DATA_ERROR,
- message=f'Error saving Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}',
- data=record
- )
- continue
-
- return True
diff --git a/article/tests.py b/article/tests.py
deleted file mode 100644
index 7ce503c..0000000
--- a/article/tests.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
diff --git a/article/utils.py b/article/utils.py
deleted file mode 100644
index b9a094e..0000000
--- a/article/utils.py
+++ /dev/null
@@ -1,204 +0,0 @@
-import logging
-import requests
-import os
-
-from sickle import Sickle
-from time import sleep
-
-from core.utils import standardizer
-
-
-ARTICLEMETA_ENDPOINT = os.environ.get('ARTICLEMETA_COLLECT_URL', 'http://articlemeta.scielo.org/api/v1/article/counter_dict')
-ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_MAX_RETRIES', 5))
-ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_SLEEP_TIME', 30))
-
-OPAC_ENDPOINT = os.environ.get('OPAC_ENDPOINT', 'https://www.scielo.br/api/v1/counter_dict')
-OPAC_MAX_RETRIES = int(os.environ.get('OPAC_MAX_RETRIES', 5))
-OPAC_SLEEP_TIME = int(os.environ.get('OPAC_SLEEP_TIME', 30))
-
-OAI_PMH_PREPRINT_ENDPOINT = os.environ.get('OAI_PMH_PREPRINT_ENDPOINT', 'https://preprints.scielo.org/index.php/scielo/oai')
-OAI_METADATA_PREFIX = os.environ.get('OAI_METADATA_PREFIX', 'oai_dc')
-OAI_PMH_MAX_RETRIES = int(os.environ.get('OAI_PMH_MAX_RETRIES', 5))
-
-DATAVERSE_ENDPOINT = os.environ.get('DATAVERSE_ENDPOINT', 'https://data.scielo.org/api')
-DATAVERSE_ROOT_COLLECTION = os.environ.get('DATAVERSE_ROOT_COLLECTION', 'scielodata')
-DATAVERSE_MAX_RETRIES = int(os.environ.get('DATAVERSE_MAX_RETRIES', 5))
-DATAVERSE_SLEEP_TIME = int(os.environ.get('DATAVERSE_SLEEP_TIME', 30))
-
-
-def fetch_article_meta_dict(from_date, until_date, offset=0, limit=1000, collection=None, issn=None):
- for t in range(1, ARTICLEMETA_MAX_RETRIES + 1):
- params = {
- 'from': from_date,
- 'until': until_date,
- 'offset': offset,
- 'limit': limit
- }
-
- if collection:
- params['collection'] = collection
-
- if issn:
- params['issn'] = issn
-
- response = requests.get(ARTICLEMETA_ENDPOINT, params=params)
-
- try:
- response.raise_for_status()
- logging.info(response.url)
-
- except requests.exceptions.HTTPError:
- logging.warning(
- 'Failed to collect data from %s. Waiting %d seconds before retry %d of %d' % (
- response.url,
- ARTICLEMETA_SLEEP_TIME,
- t,
- ARTICLEMETA_MAX_RETRIES
- )
- )
- sleep(ARTICLEMETA_SLEEP_TIME)
-
- else:
- return response.json()
-
-
-def fetch_opac_dict(from_date, until_date, page=1):
- for t in range(1, OPAC_MAX_RETRIES + 1):
- params = {
- 'begin_date': from_date,
- 'end_date': until_date,
- 'page': page
- }
-
- response = requests.get(url=OPAC_ENDPOINT, params=params, verify=False)
-
- try:
- response.raise_for_status()
- logging.info(response.url)
-
- except requests.exceptions.HTTPError:
- logging.warning('Não foi possível coletar dados de %s. Aguardando %d segundos para tentativa %d de %d' % (response.url, OPAC_SLEEP_TIME, t, OPAC_MAX_RETRIES))
- sleep(OPAC_SLEEP_TIME)
-
- else:
- return response.json()
-
-
-def fetch_preprint_oai_pmh(from_date, until_date):
- oai_client = Sickle(endpoint=OAI_PMH_PREPRINT_ENDPOINT, max_retries=OAI_PMH_MAX_RETRIES, verify=False)
- records = oai_client.ListRecords(**{
- 'metadataPrefix': OAI_METADATA_PREFIX,
- 'from': from_date,
- 'until': until_date,
- })
-
- for r in records:
- yield r
-
-
-def extract_preprint_data(record):
- pid_generic = _extract_preprint_compatible_identifer(record.header.identifier)
- text_langs = [standardizer.standardize_language_code(l) for l in record.metadata.get('language', [])]
- publication_date = record.metadata.get('date', [''])[0]
- default_language = text_langs[0] if text_langs else ''
- publication_year = _extract_preprint_publication_year_from_date(publication_date)
-
- data = {
- 'pid_generic': pid_generic,
- 'text_langs': text_langs,
- 'publication_date': publication_date,
- 'default_language': default_language,
- 'publication_year': publication_year
- }
-
- return data
-
-
-def _extract_preprint_compatible_identifer(pid_v2):
- try:
- # piv_v2 should be something like oai:ops.preprints.scielo.org:preprint/1195
- # we are using the last part of the string as the identifier
- return pid_v2.split(':')[-1].split('/')[1]
- except IndexError:
- return ''
-
-
-def _extract_preprint_publication_year_from_date(date_str):
- try:
- return date_str[:4]
- except IndexError:
- return ''
-
-
-def fetch_dataverse_metadata(from_date=None, until_date=None):
- def get_subdataverses():
- url = f"{DATAVERSE_ENDPOINT}/dataverses/{DATAVERSE_ROOT_COLLECTION}/contents"
- try:
- response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
- response.raise_for_status()
- return response.json().get("data", [])
- except requests.exceptions.RequestException as e:
- logging.error(f"Error fetching subdataverses: {e}")
- return []
-
- def get_datasets(subdataverse_id):
- url = f"{DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents"
- try:
- response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
- response.raise_for_status()
- return response.json().get("data", [])
- except requests.exceptions.RequestException as e:
- logging.error(f"Error fetching datasets for subdataverse {subdataverse_id}: {e}")
- return []
-
- def get_files(dataset_id):
- url = f"{DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files"
- try:
- response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
- response.raise_for_status()
- return response.json().get("data", [])
- except requests.exceptions.RequestException as e:
- logging.error(f"Error fetching files for dataset {dataset_id}: {e}")
- return []
-
- subdataverses = get_subdataverses()
-
- for subdataverse in subdataverses:
- if subdataverse["type"] != "dataverse":
- continue
-
- subdataverse_id = subdataverse["id"]
- subdataverse_title = subdataverse["title"]
- datasets = get_datasets(subdataverse_id)
-
- for dataset in datasets:
- if dataset["type"] != "dataset":
- continue
-
- dataset_id = dataset["id"]
- doi = standardizer.standardize_doi(dataset.get("persistentUrl"))
- if not doi:
- logging.warning(f"Dataset {dataset_id} does not have a DOI.")
- continue
-
- publication_date = dataset.get("publicationDate", None)
-
- if publication_date:
- if (from_date and publication_date < from_date) or (until_date and publication_date > until_date):
- continue
-
- files = get_files(dataset_id)
-
- for file in files:
- file_persistent_id = file["dataFile"].get("persistentId", None)
- file_persistent_id_stz = standardizer.standardize_pid_generic(file_persistent_id) if file_persistent_id else None
-
- yield {
- "title": subdataverse_title,
- "dataset_doi": doi,
- "dataset_published": publication_date,
- "file_id": file["dataFile"]["id"],
- "file_name": file["label"],
- "file_url": f"{DATAVERSE_ENDPOINT}/access/datafile/{file['dataFile']['id']}",
- "file_persistent_id": file_persistent_id_stz,
- }
diff --git a/article/views.py b/article/views.py
deleted file mode 100644
index 91ea44a..0000000
--- a/article/views.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.
diff --git a/collection/tasks.py b/collection/tasks.py
index 02fd0e7..19372de 100644
--- a/collection/tasks.py
+++ b/collection/tasks.py
@@ -1,14 +1,14 @@
from django.contrib.auth import get_user_model
from django.utils.translation import gettext as _
-from core.utils.utils import _get_user
+from core.utils.request_utils import _get_user
from collection.models import Collection
from config import celery_app
User = get_user_model()
-@celery_app.task(bind=True, name=_('Load collection data'))
+@celery_app.task(bind=True, name=_('[Collection] Load Collection Data'))
def task_load_collections(self, user_id=None, username=None):
user = _get_user(self.request, username=username, user_id=user_id)
Collection.load(user)
diff --git a/collection/wagtail_hooks.py b/collection/wagtail_hooks.py
index e7b7e97..52b31a8 100644
--- a/collection/wagtail_hooks.py
+++ b/collection/wagtail_hooks.py
@@ -1,8 +1,5 @@
from django.utils.translation import gettext as _
from wagtail.snippets.views.snippets import SnippetViewSet
-from wagtail.snippets.models import register_snippet
-
-from config.menu import get_menu_order
from .models import Collection
@@ -10,10 +7,8 @@
class CollectionSnippetViewSet(SnippetViewSet):
model = Collection
icon = "folder-open-inverse"
- menu_name = 'collection'
menu_label = _("Collection")
- menu_order = get_menu_order("collection")
- add_to_admin_menu = True
+ menu_order = 100
list_display = (
"main_name",
@@ -57,6 +52,3 @@ class CollectionSnippetViewSet(SnippetViewSet):
"updated_by",
)
export_filename = "collections"
-
-
-register_snippet(CollectionSnippetViewSet)
diff --git a/config/menu.py b/config/menu.py
index 13371c6..844ce0c 100644
--- a/config/menu.py
+++ b/config/menu.py
@@ -1,13 +1,10 @@
WAGTAIL_MENU_APPS_ORDER = {
- "collection": 100,
- "article": 200,
- "journal": 300,
- "resources": 400,
- "log_manager": 500,
- "log_manager_config": 600,
- "metrics": 700,
- "tasks": 800,
- "unexpected-error": 900,
+ "metadata": 100,
+ "resources": 200,
+ "log_manager": 300,
+ "tracker": 400,
+ "metrics": 500,
+ "tasks": 600,
}
def get_menu_order(app_name):
diff --git a/config/settings/base.py b/config/settings/base.py
index 4e96ed4..e4a99fa 100644
--- a/config/settings/base.py
+++ b/config/settings/base.py
@@ -5,7 +5,8 @@
from pathlib import Path
import environ
-from django.utils.translation import gettext_lazy as _
+
+from config.collections import COLLECTION_ACRON3_SIZE_MAP # noqa: F401
ROOT_DIR = Path(__file__).resolve(strict=True).parent.parent.parent
# core/
@@ -114,14 +115,15 @@
"core.users",
"core_settings",
# Your stuff: custom apps go here
- "article",
"collection",
"core",
- "journal",
+ "document",
"log_manager",
"log_manager_config",
"metrics",
+ "reports",
"resources",
+ "source",
"tracker",
]
@@ -404,36 +406,54 @@
SEARCH_PAGINATION_ITEMS_PER_PAGE = 10
-# Elasticsearch
+# OpenSearch
# ------------------------------------------------------------------------------
-ES_URL = env("ES_URL", default="http://192.168.0.33:9200/")
-ES_INDEX_NAME = env("ES_INDEX_NAME", default="usage")
-ES_API_KEY = env("ES_API_KEY", default="")
-ES_BASIC_AUTH = env("ES_BASIC_AUTH", default=("elastic", "iHktg66E"))
-ES_VERIFY_CERTS = env.bool("ES_VERIFY_CERTS", default=False)
+OPENSEARCH_URL = env("OPENSEARCH_URL", default="http://localhost:9200/")
+OPENSEARCH_INDEX_NAME = env("OPENSEARCH_INDEX_NAME", default="usage")
+OPENSEARCH_API_KEY = env("OPENSEARCH_API_KEY", default="")
+OPENSEARCH_BASIC_AUTH = env(
+ "OPENSEARCH_BASIC_AUTH",
+ default=("admin", "admin"),
+)
+OPENSEARCH_VERIFY_CERTS = env.bool(
+ "OPENSEARCH_VERIFY_CERTS",
+ default=False,
+)
+
+# Collectors configuration
+# ------------------------------------------------------------------------------
+# ArticleMeta
+ARTICLEMETA_COLLECT_URL = env(
+ "ARTICLEMETA_COLLECT_URL",
+ default="http://articlemeta.scielo.org/api/v1/article/counter_dict",
+)
+ARTICLEMETA_MAX_RETRIES = env.int("ARTICLEMETA_MAX_RETRIES", default=5)
+ARTICLEMETA_SLEEP_TIME = env.int("ARTICLEMETA_SLEEP_TIME", default=30)
+
+# Dataverse
+DATAVERSE_ENDPOINT = env("DATAVERSE_ENDPOINT", default="https://data.scielo.org/api")
+DATAVERSE_ROOT_COLLECTION = env("DATAVERSE_ROOT_COLLECTION", default="scielodata")
+DATAVERSE_SLEEP_TIME = env.int("DATAVERSE_SLEEP_TIME", default=30)
+
+# OPAC
+OPAC_ENDPOINT = env("OPAC_ENDPOINT", default="https://www.scielo.br/api/v1/counter_dict")
+OPAC_MAX_RETRIES = env.int("OPAC_MAX_RETRIES", default=5)
+OPAC_SLEEP_TIME = env.int("OPAC_SLEEP_TIME", default=30)
+
+# Preprints
+OAI_PMH_PREPRINT_ENDPOINT = env(
+ "OAI_PMH_PREPRINT_ENDPOINT",
+ default="https://preprints.scielo.org/index.php/scielo/oai",
+)
+OAI_METADATA_PREFIX = env("OAI_METADATA_PREFIX", default="oai_dc")
+OAI_PMH_MAX_RETRIES = env.int("OAI_PMH_MAX_RETRIES", default=5)
+
+# SciELO Books
+SCIELO_BOOKS_BASE_URL = env("SCIELO_BOOKS_BASE_URL", default="http://localhost:5984")
+SCIELO_BOOKS_TIMEOUT = env.int("SCIELO_BOOKS_TIMEOUT", default=60)
+SCIELO_BOOKS_DB_NAME = env("SCIELO_BOOKS_DB_NAME", default="scielobooks_1a")
+SCIELO_BOOKS_LIMIT = env.int("SCIELO_BOOKS_LIMIT", default=1000)
# Collection size categories
# ------------------------------------------------------------------------------
-EXTRA_LARGE_COLLECTIONS = env.list("EXTRA_LARGE_COLLECTIONS", default=["scl"])
-LARGE_COLLECTIONS = env.list("LARGE_COLLECTIONS", default=["chl", "col", "mex"])
-MEDIUM_COLLECTIONS = env.list("MEDIUM_COLLECTIONS", default=["cri", "esp", "psi", "prt", "ven"])
-SMALL_COLLECTIONS = env.list("SMALL_COLLECTIONS", default=["arg", "bol", "cub", "data", "ecu", "per", "preprints", "pry", "rve", "spa", "sss", "sza", "ury", "wid"])
-
-# Collection size mapping
-def _build_collection_size_map():
- """Build mapping of collection acronyms to their size categories."""
- size_map = {}
- size_categories = {
- "xlarge": EXTRA_LARGE_COLLECTIONS,
- "large": LARGE_COLLECTIONS,
- "medium": MEDIUM_COLLECTIONS,
- "small": SMALL_COLLECTIONS,
- }
-
- for size, collections in size_categories.items():
- for acron3 in collections:
- size_map[acron3] = size
-
- return size_map
-
-COLLECTION_ACRON3_SIZE_MAP = _build_collection_size_map()
+SUPPORTED_LOGFILE_EXTENSIONS = env.list("SUPPORTED_LOGFILE_EXTENSIONS", default=[".log", ".gz", ".zip"])
diff --git a/core/collectors/__init__.py b/core/collectors/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/core/collectors/__init__.py
@@ -0,0 +1 @@
+
diff --git a/core/collectors/articlemeta.py b/core/collectors/articlemeta.py
new file mode 100644
index 0000000..7f6ace0
--- /dev/null
+++ b/core/collectors/articlemeta.py
@@ -0,0 +1,60 @@
+import logging
+
+import requests
+from django.conf import settings
+from articlemeta.client import RestfulClient, ThriftClient
+from time import sleep
+
+
+def fetch_article_counter_dict(
+    from_date,
+    until_date,
+    offset=0,
+    limit=1000,
+    collection=None,
+    issn=None,
+):
+    """Fetch one page of the ArticleMeta ``counter_dict`` endpoint, with retries.
+
+    Args:
+        from_date: Sent as the ``from`` query parameter.
+        until_date: Sent as the ``until`` query parameter.
+        offset: Sent as the ``offset`` query parameter.
+        limit: Sent as the ``limit`` query parameter.
+        collection: Optional ``collection`` filter; omitted when falsy.
+        issn: Optional ``issn`` filter; omitted when falsy.
+
+    Returns:
+        The decoded JSON body of the first successful response, or ``{}`` when
+        all ``settings.ARTICLEMETA_MAX_RETRIES`` attempts fail.
+    """
+    # The query does not change between attempts, so build it once.
+    params = {
+        "from": from_date,
+        "until": until_date,
+        "offset": offset,
+        "limit": limit,
+    }
+    if collection:
+        params["collection"] = collection
+    if issn:
+        params["issn"] = issn
+
+    url = settings.ARTICLEMETA_COLLECT_URL
+    # Without a timeout a stalled connection can hang indefinitely.
+    # ARTICLEMETA_TIMEOUT is optional; 60 seconds is used when it is not set.
+    timeout = getattr(settings, "ARTICLEMETA_TIMEOUT", 60)
+
+    for attempt in range(1, settings.ARTICLEMETA_MAX_RETRIES + 1):
+        try:
+            response = requests.get(url, params=params, timeout=timeout)
+            response.raise_for_status()
+        except requests.exceptions.RequestException as exc:
+            # Connection errors and timeouts are retried exactly like HTTP
+            # errors instead of escaping the retry loop.
+            failed_request = getattr(exc, "request", None)
+            logging.warning(
+                "Failed to collect data from %s. Waiting %d seconds before retry %d of %d",
+                getattr(failed_request, "url", None) or url,
+                settings.ARTICLEMETA_SLEEP_TIME,
+                attempt,
+                settings.ARTICLEMETA_MAX_RETRIES,
+            )
+            sleep(settings.ARTICLEMETA_SLEEP_TIME)
+        else:
+            logging.info(response.url)
+            return response.json()
+
+    return {}
+
+
+def iter_journals(collection="scl", mode="rest"):
+    """Yield every journal that ArticleMeta reports for *collection*.
+
+    *mode* picks the client: ``"rest"`` uses ``RestfulClient`` and ``"thrift"``
+    uses ``ThriftClient``.  Any other value raises ``ValueError``; because this
+    is a generator, that happens when iteration starts.
+    """
+    if mode not in ("rest", "thrift"):
+        raise ValueError(f"Unsupported ArticleMeta mode: {mode}")
+
+    client = RestfulClient() if mode == "rest" else ThriftClient()
+    yield from client.journals(collection=collection)
diff --git a/core/collectors/dataverse.py b/core/collectors/dataverse.py
new file mode 100644
index 0000000..ca51fd7
--- /dev/null
+++ b/core/collectors/dataverse.py
@@ -0,0 +1,75 @@
+import logging
+
+import requests
+from django.conf import settings
+
+from core.utils import standardizer
+
+
+def _request_json(url):
+    """GET *url* and return its decoded JSON body.
+
+    Any ``requests`` exception is logged and replaced by an empty dict, so
+    callers can always call ``.get`` on the result of a failed request.
+    """
+    try:
+        reply = requests.get(url, timeout=settings.DATAVERSE_SLEEP_TIME)
+        reply.raise_for_status()
+        payload = reply.json()
+    except requests.exceptions.RequestException as error:
+        logging.error("Error fetching %s: %s", url, error)
+        payload = {}
+    return payload
+
+
+def _get_subdataverses():
+    """Return the ``data`` list of the root collection's contents (``[]`` if absent)."""
+    payload = _request_json(
+        f"{settings.DATAVERSE_ENDPOINT}/dataverses/{settings.DATAVERSE_ROOT_COLLECTION}/contents"
+    )
+    return payload.get("data", [])
+
+
+def _get_datasets(subdataverse_id):
+    """Return the ``data`` list of the contents of *subdataverse_id* (``[]`` if absent)."""
+    payload = _request_json(
+        f"{settings.DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents"
+    )
+    return payload.get("data", [])
+
+
+def _get_files(dataset_id):
+    """Return the ``data`` list of files in the latest version of *dataset_id* (``[]`` if absent)."""
+    payload = _request_json(
+        f"{settings.DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files"
+    )
+    return payload.get("data", [])
+
+
+def iter_dataset_metadata(from_date=None, until_date=None):
+    """Yield one metadata dict per file of every dataset under the root collection.
+
+    Entries whose ``type`` is not ``"dataverse"`` / ``"dataset"`` are skipped, as
+    are datasets without a DOI.  When a dataset has a ``publicationDate`` it is
+    compared with ``from_date`` / ``until_date`` using plain ``<`` / ``>``, and
+    datasets outside the window are skipped; datasets without a publication
+    date are never filtered out.
+    """
+    for container in _get_subdataverses():
+        if container.get("type") != "dataverse":
+            continue
+
+        container_id = container["id"]
+        container_title = container["title"]
+
+        for entry in _get_datasets(container_id):
+            if entry.get("type") != "dataset":
+                continue
+
+            entry_id = entry["id"]
+            doi = standardizer.standardize_doi(entry.get("persistentUrl"))
+            if not doi:
+                logging.warning("Dataset %s does not have a DOI.", entry_id)
+                continue
+
+            published = entry.get("publicationDate")
+            if published:
+                # Two separate guards keep the original short-circuit order.
+                if from_date and published < from_date:
+                    continue
+                if until_date and published > until_date:
+                    continue
+
+            for file_data in _get_files(entry_id):
+                data_file = file_data["dataFile"]
+                raw_pid = data_file.get("persistentId")
+                if raw_pid:
+                    file_pid = standardizer.standardize_pid_generic(raw_pid)
+                else:
+                    file_pid = None
+
+                yield {
+                    "title": container_title,
+                    "dataset_doi": doi,
+                    "dataset_published": published,
+                    "file_id": data_file["id"],
+                    "file_name": file_data["label"],
+                    "file_url": f"{settings.DATAVERSE_ENDPOINT}/access/datafile/{data_file['id']}",
+                    "file_persistent_id": file_pid,
+                }
diff --git a/core/collectors/opac.py b/core/collectors/opac.py
new file mode 100644
index 0000000..94122b7
--- /dev/null
+++ b/core/collectors/opac.py
@@ -0,0 +1,33 @@
+import logging
+
+import requests
+from django.conf import settings
+from time import sleep
+
+
+def fetch_counter_dict(from_date, until_date, page=1):
+    """Fetch one page of the OPAC ``counter_dict`` endpoint, with retries.
+
+    Args:
+        from_date: Sent as the ``begin_date`` query parameter.
+        until_date: Sent as the ``end_date`` query parameter.
+        page: Sent as the ``page`` query parameter.
+
+    Returns:
+        The decoded JSON body of the first successful response, or ``{}`` when
+        all ``settings.OPAC_MAX_RETRIES`` attempts fail.
+    """
+    # The query does not change between attempts, so build it once.
+    params = {
+        "begin_date": from_date,
+        "end_date": until_date,
+        "page": page,
+    }
+    url = settings.OPAC_ENDPOINT
+    # Without a timeout a stalled connection can hang indefinitely.
+    # OPAC_TIMEOUT is optional; 60 seconds is used when it is not set.
+    timeout = getattr(settings, "OPAC_TIMEOUT", 60)
+
+    for attempt in range(1, settings.OPAC_MAX_RETRIES + 1):
+        try:
+            # NOTE(review): verify=False disables TLS certificate checks for
+            # this request; kept as-is here, but confirm it is still required.
+            response = requests.get(
+                url=url,
+                params=params,
+                timeout=timeout,
+                verify=False,
+            )
+            response.raise_for_status()
+        except requests.exceptions.RequestException as exc:
+            # Connection errors and timeouts are retried exactly like HTTP
+            # errors instead of escaping the retry loop.
+            failed_request = getattr(exc, "request", None)
+            logging.warning(
+                "Could not collect data from %s. Waiting %d seconds for attempt %d of %d",
+                getattr(failed_request, "url", None) or url,
+                settings.OPAC_SLEEP_TIME,
+                attempt,
+                settings.OPAC_MAX_RETRIES,
+            )
+            sleep(settings.OPAC_SLEEP_TIME)
+        else:
+            logging.info(response.url)
+            return response.json()
+
+    return {}
diff --git a/core/collectors/preprints.py b/core/collectors/preprints.py
new file mode 100644
index 0000000..bead72c
--- /dev/null
+++ b/core/collectors/preprints.py
@@ -0,0 +1,55 @@
+from django.conf import settings
+from sickle import Sickle
+
+from core.utils import standardizer
+
+
+def iter_records(from_date, until_date):
+    """Yield the OAI-PMH records returned by ``ListRecords`` for the date window.
+
+    The endpoint, metadata prefix and retry count come from Django settings;
+    the Sickle client is created with ``verify=False``.
+    """
+    harvester = Sickle(
+        endpoint=settings.OAI_PMH_PREPRINT_ENDPOINT,
+        max_retries=settings.OAI_PMH_MAX_RETRIES,
+        verify=False,
+    )
+    # "from" is a Python keyword, so the arguments are passed as a dict.
+    query = {
+        "metadataPrefix": settings.OAI_METADATA_PREFIX,
+        "from": from_date,
+        "until": until_date,
+    }
+    yield from harvester.ListRecords(**query)
+
+
+def extract_record_data(record):
+    """Build a flat dict of identifier, language and date data from an OAI record.
+
+    Args:
+        record: Object exposing ``header.identifier`` and a ``metadata``
+            mapping whose ``language`` and ``date`` values are lists.
+
+    Returns:
+        A dict with the keys ``pid_generic``, ``text_langs``,
+        ``publication_date``, ``default_language`` and ``publication_year``.
+        The date and year are ``""`` when the record carries no usable date.
+    """
+    pid_generic = _extract_compatible_identifier(record.header.identifier)
+    text_langs = [
+        standardizer.standardize_language_code(language)
+        for language in record.metadata.get("language", [])
+    ]
+    # "date" may be missing, an empty list, or hold a null entry.  Indexing the
+    # raw value would raise IndexError on [] and pass None on to the year
+    # helper, so all three cases are normalised to "".
+    dates = record.metadata.get("date") or [""]
+    publication_date = dates[0] or ""
+    default_language = text_langs[0] if text_langs else ""
+    publication_year = _extract_publication_year_from_date(publication_date)
+
+    return {
+        "pid_generic": pid_generic,
+        "text_langs": text_langs,
+        "publication_date": publication_date,
+        "default_language": default_language,
+        "publication_year": publication_year,
+    }
+
+
+def _extract_compatible_identifier(identifier):
+    """Return the second "/"-separated piece of the last ":"-separated part.
+
+    Returns ``""`` when that last part contains no "/".
+    """
+    last_part = identifier.split(":")[-1]
+    pieces = last_part.split("/")
+    if len(pieces) < 2:
+        return ""
+    return pieces[1]
+
+
+def _extract_publication_year_from_date(date_str):
+    """Return the first four characters of *date_str*, or ``""`` if it cannot be sliced."""
+    try:
+        return date_str[:4]
+    except TypeError:
+        # Slicing never raises IndexError (a short string just yields a short
+        # result); the failure that can occur is a non-sliceable value such as
+        # None, which raises TypeError.
+        return ""
diff --git a/core/collectors/scielo_books.py b/core/collectors/scielo_books.py
new file mode 100644
index 0000000..b1f2dd8
--- /dev/null
+++ b/core/collectors/scielo_books.py
@@ -0,0 +1,182 @@
+import logging
+
+import requests
+from django.conf import settings
+from urllib.parse import urlencode
+
+
+
+
+def build_url(base_url, params=None):
+    """Return *base_url* with *params* appended as a query string.
+
+    When *params* is empty or ``None`` the base URL is returned unchanged.
+    Sequence values are expanded into repeated keys (``doseq=True``).
+    """
+    if params:
+        query = urlencode(params, doseq=True)
+        return f"{base_url}?{query}"
+    return base_url
+
+
+def sanitize_raw_data(payload):
+    """Return a shallow copy of *payload* with the ``_id`` key renamed to ``id``.
+
+    Non-dict values, and dicts without ``_id``, are returned as-is (same object).
+    """
+    if isinstance(payload, dict) and "_id" in payload:
+        renamed = dict(payload)
+        renamed["id"] = renamed.pop("_id")
+        return renamed
+    return payload
+
+
+def fetch_document(doc_id, base_url=None, db_name=None, headers=None):
+    """Download a single document from the SciELO Books database.
+
+    ``db_name`` and ``base_url`` fall back to ``settings.SCIELO_BOOKS_DB_NAME``
+    and ``settings.SCIELO_BOOKS_BASE_URL`` when falsy.  The request is sent
+    with ``verify=False``.
+
+    Returns:
+        A ``(payload, url)`` tuple: the JSON body after ``sanitize_raw_data``
+        and the URL it was fetched from.
+
+    Raises:
+        ValueError: when no base URL is available.
+        requests.exceptions.HTTPError: from ``raise_for_status``.
+    """
+    database = db_name or settings.SCIELO_BOOKS_DB_NAME
+    root_url = base_url or settings.SCIELO_BOOKS_BASE_URL
+    if not root_url:
+        # Log text is Portuguese for "no base URL defined for books collection".
+        logging.error("Sem base url definida para coleta de books")
+        raise ValueError("SCIELO_BOOKS_BASE_URL is not configured")
+
+    url = f"{root_url}/{database}/{doc_id}"
+    reply = requests.get(
+        url,
+        headers=headers,
+        timeout=settings.SCIELO_BOOKS_TIMEOUT,
+        verify=False,
+    )
+    reply.raise_for_status()
+    return sanitize_raw_data(reply.json()), url
+
+
+def fetch_changes_page(
+    base_url=None,
+    db_name=None,
+    since=0,
+    limit=None,
+    include_docs=False,
+    headers=None,
+):
+    """Request one page of the database ``_changes`` feed.
+
+    ``db_name``, ``limit`` and ``base_url`` fall back to the corresponding
+    ``SCIELO_BOOKS_*`` settings when falsy.  ``include_docs=true`` is added to
+    the query only when *include_docs* is truthy.  The request is sent with
+    ``verify=False``.
+
+    Returns:
+        The decoded JSON body when it is a dict, otherwise ``{}``.
+
+    Raises:
+        ValueError: when no base URL is available.
+        requests.exceptions.HTTPError: from ``raise_for_status``.
+    """
+    database = db_name or settings.SCIELO_BOOKS_DB_NAME
+    page_size = limit or settings.SCIELO_BOOKS_LIMIT
+    root_url = base_url or settings.SCIELO_BOOKS_BASE_URL
+    if not root_url:
+        # Log text is Portuguese for "no base URL defined for books collection".
+        logging.error("Sem base url definida para coleta de books")
+        raise ValueError("SCIELO_BOOKS_BASE_URL is not configured")
+
+    query = {"since": since, "limit": page_size}
+    if include_docs:
+        query["include_docs"] = "true"
+
+    url = build_url(f"{root_url}/{database}/_changes", query)
+    reply = requests.get(
+        url,
+        headers=headers,
+        timeout=settings.SCIELO_BOOKS_TIMEOUT,
+        verify=False,
+    )
+    reply.raise_for_status()
+
+    body = reply.json()
+    if isinstance(body, dict):
+        return body
+    return {}
+
+
+def extract_changes(payload):
+    """Return ``payload["results"]`` when it is a list, otherwise ``[]``."""
+    if not isinstance(payload, dict):
+        return []
+    results = payload.get("results")
+    return results if isinstance(results, list) else []
+
+
+def extract_last_seq(payload):
+    """Return the first truthy value of ``last_seq`` / ``seq`` in *payload*.
+
+    Non-dict payloads yield ``None``.
+    """
+    if not isinstance(payload, dict):
+        return None
+    return payload.get("last_seq") or payload.get("seq")
+
+
+def iter_changes(
+    base_url=None,
+    db_name=None,
+    since=0,
+    limit=None,
+    headers=None,
+):
+    """Yield every entry of the ``_changes`` feed, page by page, starting at *since*.
+
+    Paging stops when a page has no results, or when the page reports no
+    sequence value or the same one that was just requested.
+    """
+    db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+    limit = limit or settings.SCIELO_BOOKS_LIMIT
+    cursor = since or 0
+
+    while True:
+        page = fetch_changes_page(
+            base_url=base_url,
+            db_name=db_name,
+            since=cursor,
+            limit=limit,
+            include_docs=False,
+            headers=headers,
+        )
+        batch = extract_changes(page)
+        if not batch:
+            return
+
+        yield from batch
+
+        next_cursor = extract_last_seq(page)
+        # An unchanged cursor would request the same page again.
+        if next_cursor is None or next_cursor == cursor:
+            return
+        cursor = next_cursor
+
+
+def iter_change_documents(
+    base_url=None,
+    db_name=None,
+    since=0,
+    limit=None,
+    headers=None,
+):
+    """Yield one dict per change of the ``_changes`` feed, with its document.
+
+    Each yielded dict has the keys ``change``, ``deleted``, ``payload`` and
+    ``source_url``:
+
+    * deleted changes carry ``payload`` and ``source_url`` set to ``None``;
+    * changes with an embedded ``doc`` carry it after ``sanitize_raw_data``;
+    * any other change triggers a ``fetch_document`` call for its id.
+
+    Changes without an ``id`` are skipped.  Paging stops under the same
+    conditions as a plain walk of the feed: an empty page, or a missing or
+    unchanged sequence value.
+    """
+    db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+    limit = limit or settings.SCIELO_BOOKS_LIMIT
+    cursor = since or 0
+
+    while True:
+        page = fetch_changes_page(
+            base_url=base_url,
+            db_name=db_name,
+            since=cursor,
+            limit=limit,
+            include_docs=True,
+            headers=headers,
+        )
+        batch = extract_changes(page)
+        if not batch:
+            return
+
+        for change in batch:
+            doc_id = change.get("id")
+            if not doc_id:
+                continue
+
+            if change.get("deleted"):
+                removed = True
+                document = None
+                origin = None
+            else:
+                removed = False
+                embedded = change.get("doc") or {}
+                if embedded:
+                    document = sanitize_raw_data(embedded)
+                    origin = f"{(base_url or settings.SCIELO_BOOKS_BASE_URL)}/{db_name}/{doc_id}"
+                else:
+                    # The feed did not embed the document; fetch it by id.
+                    document, origin = fetch_document(
+                        doc_id=doc_id,
+                        base_url=base_url,
+                        db_name=db_name,
+                        headers=headers,
+                    )
+
+            yield {
+                "change": change,
+                "deleted": removed,
+                "payload": document,
+                "source_url": origin,
+            }
+
+        next_cursor = extract_last_seq(page)
+        # An unchanged cursor would request the same page again.
+        if next_cursor is None or next_cursor == cursor:
+            return
+        cursor = next_cursor
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index 6957700..0000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,29 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS ?=
-SPHINXBUILD ?= sphinx-build
-SOURCEDIR = .
-BUILDDIR = ./_build
-APP = /app
-
-.PHONY: help livehtml apidocs Makefile
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -c .
-
-# Build, watch and serve docs with live reload
-livehtml:
- sphinx-autobuild -b html --host 0.0.0.0 --port 9000 --watch $(APP) -c . $(SOURCEDIR) $(BUILDDIR)/html
-
-# Outputs rst files from django application code
-apidocs:
- sphinx-apidoc -o $(SOURCEDIR)/api $(APP)
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -c .
diff --git a/docs/__init__.py b/docs/__init__.py
deleted file mode 100644
index 8772c82..0000000
--- a/docs/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Included so that Django's startproject comment runs against the docs directory
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index 51cd921..0000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-
-import os
-import sys
-
-import django
-
-if os.getenv("READTHEDOCS", default=False) == "True":
- sys.path.insert(0, os.path.abspath(".."))
- os.environ["DJANGO_READ_DOT_ENV_FILE"] = "True"
- os.environ["USE_DOCKER"] = "no"
-else:
- sys.path.insert(0, os.path.abspath("/app"))
-os.environ["DATABASE_URL"] = "sqlite:///readthedocs.db"
-os.environ["CELERY_BROKER_URL"] = os.getenv("REDIS_URL", "redis://redis:6379")
-os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.local")
-django.setup()
-
-# -- Project information -----------------------------------------------------
-
-project = "SciELO Core"
-copyright = """2022, SciELO"""
-author = "SciELO"
-
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
- "sphinx.ext.autodoc",
- "sphinx.ext.napoleon",
-]
-
-# Add any paths that contain templates here, relative to this directory.
-# templates_path = ["_templates"]
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-#
-html_theme = "alabaster"
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-# html_static_path = ["_static"]
diff --git a/docs/howto.rst b/docs/howto.rst
deleted file mode 100644
index 9fae300..0000000
--- a/docs/howto.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-How To - Project Documentation
-======================================================================
-
-Get Started
-----------------------------------------------------------------------
-
-Documentation can be written as rst files in `core/docs`.
-
-
-To build and serve docs, use the commands::
-
- docker compose -f local.yml up docs
-
-
-
-Changes to files in `docs/_source` will be picked up and reloaded automatically.
-
-`Sphinx `_ is the tool used to build documentation.
-
-Docstrings to Documentation
-----------------------------------------------------------------------
-
-The sphinx extension `apidoc `_ is used to automatically document code using signatures and docstrings.
-
-Numpy or Google style docstrings will be picked up from project files and availble for documentation. See the `Napoleon `_ extension for details.
-
-For an in-use example, see the `page source <_sources/users.rst.txt>`_ for :ref:`users`.
-
-To compile all docstrings automatically into documentation source files, use the command:
- ::
-
- make apidocs
-
-
-This can be done in the docker container:
- ::
-
- docker run --rm docs make apidocs
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index b6c6ded..0000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-.. SciELO Content Manager documentation master file, created by
- sphinx-quickstart.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-Welcome to SciELO Core's documentation!
-======================================================================
-
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
-
- howto
- users
-
-
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index 4f70eed..0000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,46 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-
-if "%SPHINXBUILD%" == "" (
- set SPHINXBUILD=sphinx-build -c .
-)
-set SOURCEDIR=_source
-set BUILDDIR=_build
-set APP=..\core
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
- echo.
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
- echo.installed, then set the SPHINXBUILD environment variable to point
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
- echo.may add the Sphinx directory to PATH.
- echo.
- echo.Install sphinx-autobuild for live serving.
- echo.If you don't have Sphinx installed, grab it from
- echo.http://sphinx-doc.org/
- exit /b 1
-)
-
-%SPHINXBUILD% -b %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:livehtml
-sphinx-autobuild -b html --open-browser -p 9000 --watch %APP% -c . %SOURCEDIR% %BUILDDIR%/html
-GOTO :EOF
-
-:apidocs
-sphinx-apidoc -o %SOURCEDIR%/api %APP%
-GOTO :EOF
-
-:help
-%SPHINXBUILD% -b help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs/users.rst b/docs/users.rst
deleted file mode 100644
index 21e08aa..0000000
--- a/docs/users.rst
+++ /dev/null
@@ -1,15 +0,0 @@
- .. _users:
-
-Users
-======================================================================
-
-Starting a new project, it’s highly recommended to set up a custom user model,
-even if the default User model is sufficient for you.
-
-This model behaves identically to the default user model,
-but you’ll be able to customize it in the future if the need arises.
-
-.. automodule:: core.users.models
- :members:
- :noindex:
-
diff --git a/document/__init__.py b/document/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/document/__init__.py
@@ -0,0 +1 @@
+
diff --git a/journal/apps.py b/document/apps.py
similarity index 62%
rename from journal/apps.py
rename to document/apps.py
index e10a171..eb482d2 100644
--- a/journal/apps.py
+++ b/document/apps.py
@@ -1,6 +1,6 @@
from django.apps import AppConfig
-class JournalConfig(AppConfig):
+class DocumentConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
- name = "journal"
+ name = "document"
diff --git a/document/management/__init__.py b/document/management/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/document/management/__init__.py
@@ -0,0 +1 @@
+
diff --git a/document/management/commands/__init__.py b/document/management/commands/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/document/management/commands/__init__.py
@@ -0,0 +1 @@
+
diff --git a/document/management/commands/load_articles_by_year.py b/document/management/commands/load_articles_by_year.py
new file mode 100644
index 0000000..a922456
--- /dev/null
+++ b/document/management/commands/load_articles_by_year.py
@@ -0,0 +1,80 @@
+from django.core.management.base import BaseCommand
+
+from document.tasks import task_load_documents_from_article_meta
+from document.tasks import task_load_documents_from_opac
+
+
+class Command(BaseCommand):
+    """Queue one document-loading Celery task per calendar year."""
+
+    help = "Generate task requests for loading document data by year"
+
+    def add_arguments(self, parser):
+        """Register ``--start-year``, ``--end-year``, ``--collection`` and ``--task``."""
+        # Imported here so the command does not rely on a module-level import.
+        from datetime import date
+
+        parser.add_argument(
+            "--start-year",
+            type=int,
+            default=1990,
+            help="Start year (default: 1990)",
+        )
+        # A fixed end year silently leaves out newer documents once that year
+        # has passed, so the default follows the current calendar year.
+        parser.add_argument(
+            "--end-year",
+            type=int,
+            default=date.today().year,
+            help="End year (default: current year)",
+        )
+        parser.add_argument(
+            "--collection",
+            type=str,
+            default="scl",
+            help="Collection code (default: scl)",
+        )
+        parser.add_argument(
+            "--task",
+            choices=["load_documents_from_opac", "load_documents_from_article_meta"],
+            default="load_documents_from_opac",
+            help="Task to execute (default: load_documents_from_opac)",
+        )
+
+    def handle(self, *args, **options):
+        """Queue the selected task once for every year from start to end, inclusive."""
+        start_year = options["start_year"]
+        end_year = options["end_year"]
+        collection = options["collection"]
+
+        # The chosen task is the same for every year, so resolve it once.
+        if options["task"] == "load_documents_from_article_meta":
+            task = task_load_documents_from_article_meta
+        else:
+            task = task_load_documents_from_opac
+
+        self.stdout.write(
+            self.style.SUCCESS(
+                f"Generating task requests from {start_year} to {end_year} for collection: {collection}"
+            )
+        )
+
+        total_tasks = 0
+
+        for year in range(start_year, end_year + 1):
+            from_date = f"{year}-01-01"
+            until_date = f"{year}-12-31"
+
+            self.stdout.write(f"Queuing task for year {year}...")
+
+            task_result = task.delay(
+                from_date=from_date,
+                until_date=until_date,
+                collection=collection,
+            )
+            total_tasks += 1
+
+            self.stdout.write(
+                self.style.SUCCESS(
+                    f"✓ Task queued for year {year}: {from_date} to {until_date} (Task ID: {task_result.id})"
+                )
+            )
+
+        self.stdout.write(
+            self.style.SUCCESS(
+                f"\nCompleted! {total_tasks} tasks have been queued successfully."
+            )
+        )
diff --git a/document/migrations/0001_initial.py b/document/migrations/0001_initial.py
new file mode 100644
index 0000000..bff11be
--- /dev/null
+++ b/document/migrations/0001_initial.py
@@ -0,0 +1,279 @@
+# Generated by Django 5.0.7 on 2026-03-15 00:00
+
+import django.db.models.deletion
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ initial = True
+
+ dependencies = [
+ ("collection", "0001_initial"),
+ ("source", "0001_initial"),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name="Document",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "created",
+ models.DateTimeField(
+ auto_now_add=True,
+ verbose_name="Creation date",
+ ),
+ ),
+ (
+ "updated",
+ models.DateTimeField(
+ auto_now=True,
+ verbose_name="Last update date",
+ ),
+ ),
+ (
+ "document_type",
+ models.CharField(
+ choices=[
+ ("article", "Article"),
+ ("preprint", "Preprint"),
+ ("dataset", "Dataset"),
+ ("book", "Book"),
+ ("chapter", "Chapter"),
+ ("other", "Other"),
+ ],
+ db_index=True,
+ max_length=32,
+ verbose_name="Document Type",
+ ),
+ ),
+ (
+ "document_id",
+ models.CharField(
+ db_index=True,
+ max_length=255,
+ verbose_name="Document ID",
+ ),
+ ),
+ (
+ "scielo_issn",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=9,
+ null=True,
+ verbose_name="SciELO ISSN",
+ ),
+ ),
+ (
+ "pid_v2",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=23,
+ null=True,
+ verbose_name="PID V2",
+ ),
+ ),
+ (
+ "pid_v3",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=23,
+ null=True,
+ verbose_name="PID V3",
+ ),
+ ),
+ (
+ "pid_generic",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=255,
+ null=True,
+ verbose_name="PID Generic",
+ ),
+ ),
+ (
+ "title",
+ models.CharField(
+ blank=True,
+ max_length=500,
+ null=True,
+ verbose_name="Document Title",
+ ),
+ ),
+ (
+ "identifiers",
+ models.JSONField(
+ blank=True,
+ default=dict,
+ null=True,
+ verbose_name="Identifiers",
+ ),
+ ),
+ (
+ "files",
+ models.JSONField(
+ blank=True,
+ default=dict,
+ null=True,
+ verbose_name="Files",
+ ),
+ ),
+ (
+ "default_lang",
+ models.CharField(
+ blank=True,
+ max_length=8,
+ null=True,
+ verbose_name="Default Language",
+ ),
+ ),
+ (
+ "text_langs",
+ models.JSONField(
+ blank=True,
+ default=list,
+ null=True,
+ verbose_name="Text Languages",
+ ),
+ ),
+ (
+ "default_media_format",
+ models.CharField(
+ blank=True,
+ max_length=32,
+ null=True,
+ verbose_name="Default Media Format",
+ ),
+ ),
+ (
+ "processing_date",
+ models.CharField(
+ blank=True,
+ max_length=32,
+ null=True,
+ verbose_name="Processing Date",
+ ),
+ ),
+ (
+ "publication_date",
+ models.CharField(
+ blank=True,
+ max_length=32,
+ null=True,
+ verbose_name="Publication Date",
+ ),
+ ),
+ (
+ "publication_year",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=4,
+ null=True,
+ verbose_name="Publication Year",
+ ),
+ ),
+ (
+ "extra_data",
+ models.JSONField(
+ blank=True,
+ default=dict,
+ null=True,
+ verbose_name="Extra Data",
+ ),
+ ),
+ (
+ "collection",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ (
+ "creator",
+ models.ForeignKey(
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_creator",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Creator",
+ ),
+ ),
+ (
+ "parent_document",
+ models.ForeignKey(
+ blank=True,
+ db_index=True,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="child_documents",
+ to="document.document",
+ verbose_name="Parent Document",
+ ),
+ ),
+ (
+ "source",
+ models.ForeignKey(
+ blank=True,
+ db_index=True,
+ null=True,
+ on_delete=django.db.models.deletion.CASCADE,
+ related_name="documents",
+ to="source.source",
+ verbose_name="Source",
+ ),
+ ),
+ (
+ "updated_by",
+ models.ForeignKey(
+ blank=True,
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_last_mod_user",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Updater",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Document",
+ "verbose_name_plural": "Documents",
+ "unique_together": {("collection", "document_type", "document_id")},
+ "indexes": [
+ models.Index(
+ fields=["collection", "document_type"],
+ name="document_collection_type_idx",
+ ),
+ models.Index(
+ fields=["collection", "scielo_issn"],
+ name="document_collection_issn_idx",
+ ),
+ models.Index(
+ fields=["collection", "pid_v2"],
+ name="document_collection_pidv2_idx",
+ ),
+ models.Index(
+ fields=["collection", "pid_generic"],
+ name="doc_coll_pidgen_idx",
+ ),
+ ],
+ },
+ ),
+ ]
diff --git a/document/migrations/__init__.py b/document/migrations/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/document/migrations/__init__.py
@@ -0,0 +1 @@
+
diff --git a/document/models.py b/document/models.py
new file mode 100644
index 0000000..5197692
--- /dev/null
+++ b/document/models.py
@@ -0,0 +1,258 @@
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+
+from collection.models import Collection
+from core.models import CommonControlField
+from source.models import Source
+
+
+class Document(CommonControlField):
+    """A document (article, preprint, dataset, book, chapter or other) held by a collection."""
+
+    DOCUMENT_TYPE_ARTICLE = "article"
+    DOCUMENT_TYPE_PREPRINT = "preprint"
+    DOCUMENT_TYPE_DATASET = "dataset"
+    DOCUMENT_TYPE_BOOK = "book"
+    DOCUMENT_TYPE_CHAPTER = "chapter"
+    DOCUMENT_TYPE_OTHER = "other"
+    DOCUMENT_TYPE_CHOICES = (
+        (DOCUMENT_TYPE_ARTICLE, _("Article")),
+        (DOCUMENT_TYPE_PREPRINT, _("Preprint")),
+        (DOCUMENT_TYPE_DATASET, _("Dataset")),
+        (DOCUMENT_TYPE_BOOK, _("Book")),
+        (DOCUMENT_TYPE_CHAPTER, _("Chapter")),
+        (DOCUMENT_TYPE_OTHER, _("Other")),
+    )
+
+    collection = models.ForeignKey(
+        Collection,
+        verbose_name=_("Collection"),
+        on_delete=models.CASCADE,
+        blank=False,
+        null=False,
+        db_index=True,
+    )
+
+    source = models.ForeignKey(
+        Source,
+        verbose_name=_("Source"),
+        on_delete=models.CASCADE,
+        related_name="documents",
+        blank=True,
+        null=True,
+        db_index=True,
+    )
+
+    parent_document = models.ForeignKey(
+        "self",
+        verbose_name=_("Parent Document"),
+        on_delete=models.SET_NULL,
+        related_name="child_documents",
+        blank=True,
+        null=True,
+        db_index=True,
+    )
+
+    document_type = models.CharField(
+        verbose_name=_("Document Type"),
+        max_length=32,
+        choices=DOCUMENT_TYPE_CHOICES,
+        blank=False,
+        null=False,
+        db_index=True,
+    )
+
+    document_id = models.CharField(
+        verbose_name=_("Document ID"),
+        max_length=255,
+        blank=False,
+        null=False,
+        db_index=True,
+    )
+
+    scielo_issn = models.CharField(
+        verbose_name=_("SciELO ISSN"),
+        max_length=9,
+        blank=True,
+        null=True,
+        db_index=True,
+    )
+
+    pid_v2 = models.CharField(
+        verbose_name=_("PID V2"),
+        max_length=23,
+        blank=True,
+        null=True,
+        db_index=True,
+    )
+
+    pid_v3 = models.CharField(
+        verbose_name=_("PID V3"),
+        max_length=23,
+        blank=True,
+        null=True,
+        db_index=True,
+    )
+
+    pid_generic = models.CharField(
+        verbose_name=_("PID Generic"),
+        max_length=255,
+        blank=True,
+        null=True,
+        db_index=True,
+    )
+
+    title = models.CharField(
+        verbose_name=_("Document Title"),
+        max_length=500,
+        blank=True,
+        null=True,
+    )
+
+    identifiers = models.JSONField(
+        verbose_name=_("Identifiers"),
+        null=True,
+        blank=True,
+        default=dict,
+    )
+
+    files = models.JSONField(
+        verbose_name=_("Files"),
+        null=True,
+        blank=True,
+        default=dict,
+    )
+
+    default_lang = models.CharField(
+        verbose_name=_("Default Language"),
+        max_length=8,
+        blank=True,
+        null=True,
+    )
+
+    text_langs = models.JSONField(
+        verbose_name=_("Text Languages"),
+        null=True,
+        blank=True,
+        default=list,
+    )
+
+    default_media_format = models.CharField(
+        verbose_name=_("Default Media Format"),
+        max_length=32,
+        blank=True,
+        null=True,
+    )
+
+    processing_date = models.CharField(
+        verbose_name=_("Processing Date"),
+        max_length=32,
+        blank=True,
+        null=True,
+    )
+
+    publication_date = models.CharField(
+        verbose_name=_("Publication Date"),
+        max_length=32,
+        blank=True,
+        null=True,
+    )
+
+    publication_year = models.CharField(
+        verbose_name=_("Publication Year"),
+        max_length=4,
+        blank=True,
+        null=True,
+        db_index=True,
+    )
+
+    extra_data = models.JSONField(
+        verbose_name=_("Extra Data"),
+        null=True,
+        blank=True,
+        default=dict,
+    )
+
+    def __str__(self):
+        return f"{self.collection.acron3} - {self.document_type} - {self.document_id}"
+
+    @classmethod
+    def metadata(cls, collection=None):
+        """Yield one plain metadata dict per document, optionally only those of ``collection``."""
+        # Join parent_document too, so reading its document_id below costs no query per row.
+        queryset = cls.objects.select_related("collection", "source", "parent_document").only(
+            "collection__acron3",
+            "default_lang",
+            "default_media_format",
+            "document_id",
+            "document_type",
+            "extra_data",
+            "files",
+            "identifiers",
+            "parent_document__document_id",
+            "pid_generic",
+            "pid_v2",
+            "pid_v3",
+            "processing_date",
+            "publication_date",
+            "publication_year",
+            "scielo_issn",
+            "source__scielo_issn",
+            "source__source_id",
+            "source__source_type",
+            "text_langs",
+            "title",
+        )
+
+        if collection:
+            queryset = queryset.filter(collection=collection)
+
+        for document in queryset.iterator():
+            source = document.source
+            yield {
+                "collection": document.collection.acron3,
+                "default_lang": document.default_lang,
+                "default_media_format": document.default_media_format,
+                "document_id": document.document_id,
+                "document_type": document.document_type,
+                "extra_data": document.extra_data or {},
+                "files": document.files or {},
+                "identifiers": document.identifiers or {},
+                "parent_document_id": (
+                    document.parent_document.document_id if document.parent_document else None
+                ),
+                "pid_generic": document.pid_generic,
+                "pid_v2": document.pid_v2,
+                "pid_v3": document.pid_v3,
+                "processing_date": document.processing_date,
+                "publication_date": document.publication_date,
+                "publication_year": document.publication_year,
+                "scielo_issn": document.scielo_issn or (source.scielo_issn if source else None),
+                "source_id": source.source_id if source else None,
+                "source_type": source.source_type if source else None,
+                "text_langs": document.text_langs or [],
+                "title": document.title,
+            }
+
+    class Meta:
+        verbose_name = _("Document")
+        verbose_name_plural = _("Documents")
+        unique_together = ("collection", "document_type", "document_id")
+        indexes = [
+            models.Index(
+                fields=["collection", "document_type"],
+                name="document_collection_type_idx",
+            ),
+            models.Index(
+                fields=["collection", "scielo_issn"],
+                name="document_collection_issn_idx",
+            ),
+            models.Index(
+                fields=["collection", "pid_v2"],
+                name="document_collection_pidv2_idx",
+            ),
+            models.Index(
+                fields=["collection", "pid_generic"],
+                name="doc_coll_pidgen_idx",
+            ),
+        ]
diff --git a/document/services/__init__.py b/document/services/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/document/services/__init__.py
@@ -0,0 +1 @@
+
diff --git a/document/services/articles.py b/document/services/articles.py
new file mode 100644
index 0000000..09244b3
--- /dev/null
+++ b/document/services/articles.py
@@ -0,0 +1,166 @@
+from document.models import Document
+
+from .common import build_document_id, compact_dict, get_existing_document, normalize_langs, normalize_year
+
+
+def upsert_article_document_from_articlemeta(
+    payload,
+    collection,
+    source=None,
+    user=None,
+    force_update=True,
+):
+    """Create or update the article ``Document`` for an ArticleMeta ``payload``.
+
+    Returns ``None`` if the payload has no usable identifier, else the saved document.
+    """
+    pid_v2 = payload.get("code")
+    document_id = build_document_id(pid_v2, payload.get("pid_v3"), payload.get("pid_generic"))
+    if not document_id:
+        return None
+
+    document = get_existing_document(
+        collection, Document.DOCUMENT_TYPE_ARTICLE, document_id, pid_v2
+    )
+    created = document is None
+    if created:
+        document = Document(
+            collection=collection,
+            document_type=Document.DOCUMENT_TYPE_ARTICLE,
+            document_id=document_id,
+        )
+        if user:
+            document.creator = user
+
+    if created or force_update:
+        document.source = source
+        document.parent_document = None
+        document.scielo_issn = source.scielo_issn if source else None
+        document.pid_v2 = pid_v2 or document.pid_v2
+        document.pid_v3 = payload.get("pid_v3") or document.pid_v3
+        document.pid_generic = payload.get("pid_generic") or document.pid_generic
+        document.title = payload.get("title") or document.title
+        document.identifiers = _merge_dicts(
+            document.identifiers,
+            _build_articlemeta_identifiers(payload, source),
+        )
+        document.files = payload.get("pdfs") or document.files or {}
+        document.default_lang = payload.get("default_language") or document.default_lang
+        # Like the other fields, keep the stored languages when the payload has none.
+        document.text_langs = normalize_langs(payload.get("text_langs")) or document.text_langs or []
+        document.processing_date = payload.get("processing_date") or document.processing_date
+        document.publication_date = payload.get("publication_date") or document.publication_date
+        document.publication_year = normalize_year(
+            payload.get("publication_year"), fallback_date=document.publication_date
+        )
+        document.extra_data = _merge_dicts(
+            document.extra_data,
+            compact_dict(
+                {
+                    "provider": "articlemeta",
+                    "issn_codes": payload.get("code_title"),
+                }
+            ),
+        )
+
+    if user:
+        document.updated_by = user
+
+    document.save()
+    return document
+
+
+def upsert_article_document_from_opac(
+    payload,
+    collection,
+    source=None,
+    user=None,
+    force_update=True,
+):
+    """Create or refresh the article ``Document`` described by one OPAC ``payload``.
+
+    Gives back ``None`` when no identifier can be derived from the payload;
+    otherwise the document is saved and returned. Payload-derived fields are
+    written only for a new document or when ``force_update`` is true.
+    """
+    pid_v2 = payload.get("pid_v2")
+    pid_v3 = payload.get("pid_v3")
+    pid_generic = payload.get("pid_generic")
+    document_id = build_document_id(pid_v2, pid_v3, pid_generic)
+    if not document_id:
+        return None
+
+    article_type = Document.DOCUMENT_TYPE_ARTICLE
+    document = get_existing_document(
+        collection, article_type, document_id, pid_v2, pid_v3, pid_generic
+    )
+    created = document is None
+    if created:
+        document = Document(
+            collection=collection,
+            document_type=article_type,
+            document_id=document_id,
+        )
+        if user:
+            document.creator = user
+
+    if created or force_update:
+        document.source = source
+        document.parent_document = None
+        document.scielo_issn = source.scielo_issn if source else None
+        document.pid_v2 = pid_v2 or document.pid_v2
+        document.pid_v3 = pid_v3 or document.pid_v3
+        document.pid_generic = pid_generic or document.pid_generic
+        document.title = payload.get("title") or document.title
+        opac_identifiers = _build_opac_identifiers(payload, source)
+        document.identifiers = _merge_dicts(document.identifiers, opac_identifiers)
+        document.files = document.files or {}
+        document.default_lang = payload.get("default_language") or document.default_lang
+        payload_langs = normalize_langs(payload.get("text_langs"))
+        document.text_langs = payload_langs or document.text_langs or []
+        # default_media_format and processing_date keep their stored values.
+        document.publication_date = payload.get("publication_date") or document.publication_date
+        document.publication_year = normalize_year(
+            payload.get("publication_year"),
+            fallback_date=document.publication_date,
+        )
+
+        provider_info = compact_dict(
+            {
+                "provider": "opac",
+                "journal_acronym": payload.get("journal_acronym"),
+            }
+        )
+        document.extra_data = _merge_dicts(document.extra_data, provider_info)
+
+    if user:
+        document.updated_by = user
+
+    document.save()
+    return document
+
+
+def _build_articlemeta_identifiers(payload, source):
+    """Collect the identifiers ArticleMeta provides: the PID v2 and the source's ISSN."""
+    identifiers = {"pid_v2": payload.get("code")}
+    if source:
+        identifiers["scielo_issn"] = source.scielo_issn
+    # compact_dict drops whichever of the values came out empty.
+    return compact_dict(identifiers)
+
+
+def _build_opac_identifiers(payload, source):
+    """Collect the identifiers OPAC provides, leaving out the empty ones.
+
+    The journal ISSN comes from ``source`` when one is given.
+    """
+    identifiers = {key: payload.get(key) for key in ("pid_v2", "pid_v3")}
+    identifiers["scielo_issn"] = source.scielo_issn if source else None
+    identifiers["journal_acronym"] = payload.get("journal_acronym")
+    return compact_dict(identifiers)
+
+
+def _merge_dicts(current, new_values):
+    """Return a new dict with ``new_values`` laid over ``current``; falsy inputs count as empty."""
+    base, overlay = dict(current or {}), dict(new_values or {})
+    return {**base, **overlay}
diff --git a/document/services/books.py b/document/services/books.py
new file mode 100644
index 0000000..96d92e1
--- /dev/null
+++ b/document/services/books.py
@@ -0,0 +1,256 @@
+from document.models import Document
+
+
+def build_book_pid_generic(book_id):
+    """Return ``"book:<book_id>"``, or ``None`` when ``book_id`` is ``None`` or empty."""
+    has_id = book_id not in (None, "")
+    return f"book:{book_id}" if has_id else None
+
+
+def build_chapter_pid_generic(book_id, chapter_id):
+    """Return ``"book:<book_id>/chapter:<chapter_id>"``, or ``None`` if either id is missing."""
+    complete = all(part not in (None, "") for part in (book_id, chapter_id))
+    return f"book:{book_id}/chapter:{chapter_id}" if complete else None
+
+
+def enrich_part_payload(payload, monograph_payload):
+    """Return a copy of ``payload`` with ``monograph_*`` keys filled from ``monograph_payload``."""
+    # Without monograph data there is nothing to add: the payload is returned untouched.
+    if not monograph_payload:
+        return payload
+
+    key_map = (
+        ("monograph_title", "title"), ("monograph_language", "language"),
+        ("monograph_publication_date", "publication_date"), ("monograph_year", "year"),
+        ("monograph_publisher", "publisher"), ("monograph_isbn", "isbn"),
+        ("monograph_eisbn", "eisbn"), ("monograph_doi_number", "doi_number"),
+        ("monograph_creators", "creators"),
+    )
+    inherited = {target: monograph_payload.get(origin) for target, origin in key_map}
+    return {**payload, **inherited}
+
+
+def upsert_monograph_document(
+    payload,
+    collection,
+    source=None,
+    user=None,
+    force_update=True,
+    source_url=None,
+    last_seq=None,
+):
+    """Create or update the book ``Document`` for a ``Monograph`` payload; ``None`` if unusable."""
+    if payload.get("TYPE") != "Monograph":
+        return None
+
+    # Check the raw id first: str(None) would otherwise produce the bogus id "book:None".
+    pid_generic = build_book_pid_generic(payload.get("id"))
+    if pid_generic is None:
+        return None
+    book_id = str(payload.get("id"))
+    document, created = Document.objects.get_or_create(
+        collection=collection, document_type=Document.DOCUMENT_TYPE_BOOK, document_id=pid_generic
+    )
+
+    if created and user:
+        document.creator = user
+
+    if created or force_update:
+        document.source = source
+        document.parent_document = None
+        document.scielo_issn = None
+        document.pid_v2 = None
+        document.pid_v3 = None
+        document.pid_generic = pid_generic
+        document.title = payload.get("title") or book_id
+        document.identifiers = _build_monograph_identifiers(payload)
+        document.files = {}
+        document.default_lang = payload.get("language") or None
+        document.text_langs = _unique_list(payload.get("language"))
+        document.default_media_format = None
+        document.processing_date = None
+        document.publication_date = payload.get("publication_date") or None
+        document.publication_year = _normalize_year(payload.get("year"))
+        document.extra_data = _build_monograph_extra_data(
+            payload, source_url=source_url, last_seq=last_seq
+        )
+
+    if user:
+        document.updated_by = user
+
+    document.save()
+    return document
+
+
+def upsert_part_document(
+    payload,
+    collection,
+    source=None,
+    parent_document=None,
+    user=None,
+    force_update=True,
+    source_url=None,
+    last_seq=None,
+):
+    """Create or update the chapter ``Document`` for a ``Part`` payload.
+
+    Returns ``None`` for any other payload type, or when the book or chapter id is missing.
+    """
+    if payload.get("TYPE") != "Part":
+        return None
+
+    book_id = payload.get("monograph")
+    chapter_id = payload.get("id")
+    pid_generic = build_chapter_pid_generic(book_id, chapter_id)
+    # Without both ids there is no document_id to store, so the payload is skipped.
+    if pid_generic is None:
+        return None
+    document, created = Document.objects.get_or_create(
+        collection=collection,
+        document_type=Document.DOCUMENT_TYPE_CHAPTER,
+        document_id=pid_generic,
+    )
+
+    if created and user:
+        document.creator = user
+
+    if created or force_update:
+        document.source = source
+        document.parent_document = parent_document
+        document.scielo_issn = None
+        document.pid_v2 = None
+        document.pid_v3 = None
+        document.pid_generic = pid_generic
+        document.title = payload.get("title") or str(chapter_id)
+        document.identifiers = _build_part_identifiers(payload)
+        document.files = {}
+        part_lang = payload.get("text_language") or payload.get("monograph_language")
+        document.default_lang = part_lang or None
+        document.text_langs = _unique_list(part_lang)
+        document.default_media_format = None
+        document.processing_date = None
+        document.publication_date = payload.get("monograph_publication_date") or None
+        document.publication_year = _normalize_year(payload.get("monograph_year"))
+        document.extra_data = _build_part_extra_data(
+            payload, source_url=source_url, last_seq=last_seq
+        )
+
+    if user:
+        document.updated_by = user
+
+    document.save()
+    return document
+
+
+def delete_book_document(collection, book_id):
+    """Delete the book ``Document`` of ``collection`` for ``book_id``; returns ``delete()``'s result."""
+    # Books are stored with their generic PID as document_id.
+    lookup = dict(collection=collection, document_type=Document.DOCUMENT_TYPE_BOOK)
+    lookup["document_id"] = build_book_pid_generic(book_id)
+    return Document.objects.filter(**lookup).delete()
+
+
+def delete_document_by_raw_id(collection, raw_id):
+    """Delete every document of ``collection`` whose ``extra_data["raw_id"]`` equals ``str(raw_id)``."""
+    raw_key = str(raw_id)
+    stale = Document.objects.filter(collection=collection, extra_data__raw_id=raw_key)
+    return stale.delete()
+
+
+def has_monograph_document_for_raw_id(collection, raw_id):
+    """Tell whether ``collection`` holds a book document whose ``extra_data["raw_id"]`` is ``raw_id``."""
+    books = Document.objects.filter(
+        collection=collection, document_type=Document.DOCUMENT_TYPE_BOOK
+    )
+    return books.filter(extra_data__raw_id=str(raw_id)).exists()
+
+
+def get_monograph_document(collection, book_id):
+    """Return the book ``Document`` of ``collection`` for ``book_id``, or ``None`` if none is stored."""
+    # Books are stored with their generic PID as document_id.
+    lookup = dict(collection=collection, document_type=Document.DOCUMENT_TYPE_BOOK)
+    lookup["document_id"] = build_book_pid_generic(book_id)
+    return Document.objects.filter(**lookup).first()
+
+
+def _build_monograph_identifiers(payload):
+    """Collect the non-empty identifiers (book id, ISBN, eISBN, DOI) of a monograph payload."""
+    book_id = payload.get("id")
+    collected = {"book_id": None if book_id is None else str(book_id)}
+    collected["isbn"] = payload.get("isbn")
+    collected["eisbn"] = payload.get("eisbn")
+    collected["doi"] = payload.get("doi_number")
+    return _compact_dict(collected)
+
+
+def _build_part_identifiers(payload):
+    """Collect the non-empty identifiers of a chapter payload and of the book it belongs to."""
+    book_id, chapter_id = payload.get("monograph"), payload.get("id")
+    collected = {"book_id": None if book_id is None else str(book_id)}
+    collected["chapter_id"] = None if chapter_id is None else str(chapter_id)
+    collected["isbn"] = payload.get("monograph_isbn")
+    collected["eisbn"] = payload.get("monograph_eisbn")
+    collected["doi"] = payload.get("doi_number")
+    collected["book_doi"] = payload.get("monograph_doi_number")
+    return _compact_dict(collected)
+
+
+def _build_monograph_extra_data(payload, source_url=None, last_seq=None):
+    """Assemble the ``extra_data`` of a book document, dropping empty values."""
+    raw_id = payload.get("id")
+    details = {
+        "raw_id": None if raw_id is None else str(raw_id),
+        "raw_type": payload.get("TYPE"),
+        "source_url": source_url,
+        "last_seq": last_seq,
+    }
+
+    # These payload entries are stored under their own names.
+    passthrough = (
+        "visible", "city", "country", "pages", "publisher",
+        "creators", "translated_titles", "translated_synopses", "synopsis",
+    )
+    details.update((name, payload.get(name)) for name in passthrough)
+    return _compact_dict(details)
+
+
+def _build_part_extra_data(payload, source_url=None, last_seq=None):
+    """Assemble the ``extra_data`` of a chapter document, dropping empty values."""
+    raw_id, monograph_id = payload.get("id"), payload.get("monograph")
+    details = {
+        "raw_id": None if raw_id is None else str(raw_id),
+        "raw_type": payload.get("TYPE"),
+        "source_url": source_url,
+        "last_seq": last_seq,
+    }
+    for name in ("visible", "order", "pages", "creators", "translated_titles"):
+        details[name] = payload.get(name)
+
+    details["monograph_id"] = None if monograph_id is None else str(monograph_id)
+    monograph_fields = (
+        "monograph_title", "monograph_language", "monograph_publication_date",
+        "monograph_year", "monograph_publisher", "monograph_creators",
+    )
+    for name in monograph_fields:
+        details[name] = payload.get(name)
+    return _compact_dict(details)
+
+
+def _unique_list(value):
+    """Wrap a truthy ``value`` in a one-item list; any falsy value gives an empty list."""
+    # NOTE: despite the name, nothing is de-duplicated here.
+    return [value] if value else []
+
+
+def _normalize_year(value):
+    """Return the first four characters of ``str(value)``, or ``None`` for ``None``/``""``."""
+    is_blank = value in (None, "")
+    return None if is_blank else str(value)[:4]
+
+
+def _compact_dict(data):
+    """Return a copy of ``data`` without entries whose value is ``None`` or an empty str/list/dict/tuple."""
+    empties = (None, "", [], {}, ())
+    # ``in`` compares by equality, so 0 and False are kept: they equal none of the empties.
+    kept = ((key, value) for key, value in data.items() if value not in empties)
+    return dict(kept)
diff --git a/document/services/common.py b/document/services/common.py
new file mode 100644
index 0000000..91e103d
--- /dev/null
+++ b/document/services/common.py
@@ -0,0 +1,58 @@
+from document.models import Document
+
+
+def build_document_id(*values):
+    """Return the first value that is neither ``None`` nor ``""`` as a string, else ``None``."""
+    # Lazy generator: only the winning value is ever converted with str().
+    usable = (str(value) for value in values if value not in (None, ""))
+    return next(usable, None)
+
+
+def get_existing_document(collection, document_type, *identifiers):
+    """Find a stored document of the given collection and type matching any identifier.
+
+    Fields are tried in the order document_id, pid_v2, pid_v3, pid_generic, each against
+    every non-empty identifier; the first hit is returned, ``None`` if there is none.
+    """
+    candidates = [str(value) for value in identifiers if value not in (None, "")]
+    if not candidates:
+        return None
+    scoped = Document.objects.filter(collection=collection, document_type=document_type)
+    lookup_fields = ("document_id", "pid_v2", "pid_v3", "pid_generic")
+    lookups = ({field: value} for field in lookup_fields for value in candidates)
+    for lookup in lookups:
+        match = scoped.filter(**lookup).first()
+        if match:
+            return match
+    return None
+
+
+def normalize_langs(value):
+    """Turn a language value (list, dict of flags, or single value) into a list.
+
+    Lists lose their ``None``/``""`` items, dicts yield the keys with a truthy value,
+    anything else truthy is wrapped in a list; falsy input gives ``[]``.
+    """
+    if isinstance(value, list) and value:
+        return [lang for lang in value if lang not in (None, "")]
+    if isinstance(value, dict) and value:
+        return [lang for lang, enabled in value.items() if enabled]
+    return [value] if value else []
+
+
+def normalize_year(value, fallback_date=None):
+    """Return the 4-char year prefix of ``value``, else of ``fallback_date``, else ``None``."""
+    # ``None`` and ``""`` both count as "no value".
+    for candidate in (value, fallback_date):
+        if candidate not in (None, ""):
+            return str(candidate)[:4]
+
+    return None
+
+
+def compact_dict(data):
+    """Return a copy of ``data`` without entries whose value is ``None`` or an empty str/list/dict/tuple."""
+    empties = (None, "", [], {}, ())
+    # ``in`` compares by equality, so 0 and False are kept: they equal none of the empties.
+    kept = ((key, value) for key, value in data.items() if value not in empties)
+    return dict(kept)
diff --git a/document/services/datasets.py b/document/services/datasets.py
new file mode 100644
index 0000000..2496b20
--- /dev/null
+++ b/document/services/datasets.py
@@ -0,0 +1,69 @@
+from document.models import Document
+
+from .common import compact_dict, normalize_year
+
+
+def upsert_dataset_document(
+    payload,
+    collection,
+    user=None,
+    force_update=True,
+):
+    """Create or refresh the dataset ``Document`` named by ``payload["dataset_doi"]``.
+
+    A payload without a DOI is ignored and ``None`` is returned. When the payload
+    carries a ``file_id``, that file's details are recorded in the document's ``files``.
+    Payload-derived fields are written only for a new document or when
+    ``force_update`` is true; the document is saved and returned either way.
+    """
+    dataset_doi = payload.get("dataset_doi")
+    if not dataset_doi:
+        return None
+
+    document, created = Document.objects.get_or_create(
+        collection=collection,
+        document_type=Document.DOCUMENT_TYPE_DATASET,
+        document_id=dataset_doi,
+    )
+
+    if created and user:
+        document.creator = user
+
+    if created or force_update:
+        # Start from the files already stored so that earlier entries survive.
+        known_files = dict(document.files or {})
+        file_id = payload.get("file_id")
+        if file_id:
+            file_details = {
+                "name": payload.get("file_name"),
+                "url": payload.get("file_url"),
+                "file_persistent_id": payload.get("file_persistent_id"),
+            }
+            known_files[str(file_id)] = compact_dict(file_details)
+
+        # Source, parent and SciELO identifiers are always cleared for datasets.
+        document.source = None
+        document.parent_document = None
+        document.scielo_issn = None
+        document.pid_v2 = None
+        document.pid_v3 = None
+        document.pid_generic = dataset_doi
+        document.title = payload.get("title") or document.title
+        document.identifiers = compact_dict({"dataset_doi": dataset_doi})
+        document.files = known_files
+
+        # default_lang, default_media_format and processing_date keep their stored values.
+        document.text_langs = document.text_langs or []
+        document.publication_date = payload.get("dataset_published") or document.publication_date
+        # No explicit year is passed: it is taken from the publication date.
+        document.publication_year = normalize_year(
+            None,
+            fallback_date=document.publication_date,
+        )
+        document.extra_data = compact_dict({"provider": "dataverse"})
+
+    if user:
+        document.updated_by = user
+
+    document.save()
+    return document
diff --git a/document/services/preprints.py b/document/services/preprints.py
new file mode 100644
index 0000000..4be89f1
--- /dev/null
+++ b/document/services/preprints.py
@@ -0,0 +1,58 @@
+from document.models import Document
+
+from .common import compact_dict, normalize_langs, normalize_year
+
+
+def upsert_preprint_document(
+    payload,
+    collection,
+    user=None,
+    force_update=True,
+):
+    """Create or refresh the preprint ``Document`` named by ``payload["pid_generic"]``.
+
+    A payload without ``pid_generic`` is ignored and ``None`` is returned.
+    Payload-derived fields are written only for a new document or when
+    ``force_update`` is true; the document is saved and returned either way.
+    """
+    pid_generic = payload.get("pid_generic")
+    if not pid_generic:
+        return None
+
+    document, created = Document.objects.get_or_create(
+        collection=collection,
+        document_type=Document.DOCUMENT_TYPE_PREPRINT,
+        document_id=pid_generic,
+    )
+
+    if created and user:
+        document.creator = user
+
+    if created or force_update:
+        # Source, parent and SciELO identifiers are always cleared for preprints.
+        document.source = None
+        document.parent_document = None
+        document.scielo_issn = None
+        document.pid_v2 = None
+        document.pid_v3 = None
+        document.pid_generic = pid_generic
+        document.title = payload.get("title") or document.title
+        document.identifiers = compact_dict({"pid_generic": pid_generic})
+        document.files = document.files or {}
+        document.default_lang = payload.get("default_language") or document.default_lang
+        # NOTE(review): unlike title or default_lang, stored languages are replaced even by [].
+        document.text_langs = normalize_langs(payload.get("text_langs"))
+        # default_media_format and processing_date keep their stored values.
+        document.publication_date = payload.get("publication_date") or document.publication_date
+        explicit_year = payload.get("publication_year")
+        document.publication_year = normalize_year(
+            explicit_year,
+            fallback_date=document.publication_date,
+        )
+        document.extra_data = compact_dict({"provider": "preprints"})
+
+    if user:
+        document.updated_by = user
+
+    document.save()
+    return document
diff --git a/document/tasks/__init__.py b/document/tasks/__init__.py
new file mode 100644
index 0000000..95a0ba5
--- /dev/null
+++ b/document/tasks/__init__.py
@@ -0,0 +1,28 @@
+from .articlemeta import (
+ load_documents_from_article_meta,
+ task_load_documents_from_article_meta,
+)
+from .common import (
+ get_latest_scielo_books_last_seq,
+)
+from .dataverse import (
+ load_dataset_metadata_from_dataverse,
+ task_load_dataset_metadata_into_documents,
+)
+from .opac import (
+ load_documents_from_opac,
+ task_load_documents_from_opac,
+)
+from .pipeline import (
+ task_daily_metadata_sync_pipeline,
+)
+from .preprints import (
+ load_preprints_from_preprints_api,
+ task_load_preprints_into_documents,
+)
+from .scielo_books import (
+ load_documents_from_scielo_books,
+ sync_documents_from_scielo_books,
+ task_load_documents_from_scielo_books,
+ task_sync_documents_from_scielo_books,
+)
diff --git a/document/tasks/articlemeta.py b/document/tasks/articlemeta.py
new file mode 100644
index 0000000..75b2689
--- /dev/null
+++ b/document/tasks/articlemeta.py
@@ -0,0 +1,120 @@
+import logging
+
+from django.db import DataError
+from django.utils.translation import gettext as _
+
+from core.collectors import articlemeta as articlemeta_collector
+from core.utils import date_utils
+from core.utils.request_utils import _get_user
+from document.services import articles as article_service
+from source.services import journals as journal_service
+
+from config import celery_app
+
+from .common import _get_collection
+
+
+def load_documents_from_article_meta(
+    from_date=None,
+    until_date=None,
+    days_to_go_back=None,
+    collection=None,
+    issn=None,
+    force_update=True,
+    user=None,
+):
+    """Page through Article Meta and upsert one article ``Document`` per record.
+
+    Records whose collection or journal source cannot be found are logged and
+    skipped, and so are records the database rejects with ``DataError``.
+    Returns ``True`` once a page comes back without records.
+    """
+    from_date, until_date = date_utils.get_date_range_str(
+        from_date,
+        until_date,
+        days_to_go_back,
+    )
+    logging.info(
+        "Loading documents from Article Meta. From: %s, Until: %s, Collection: %s, ISSN: %s",
+        from_date,
+        until_date,
+        collection,
+        issn,
+    )
+
+    page_size = 1000
+    offset = 0
+    while True:
+        response = articlemeta_collector.fetch_article_counter_dict(
+            from_date,
+            until_date,
+            offset=offset,
+            limit=page_size,
+            collection=collection,
+            issn=issn,
+        )
+        records = response.get("objects") or []
+        if not records:
+            break
+
+        for payload in records:
+            pid_v2 = payload.get("code")
+            issns = payload.get("code_title")
+
+            collection_obj = _get_collection(payload.get("collection") or collection)
+            if not collection_obj:
+                logging.info("Collection not found for payload %s", pid_v2)
+                continue
+
+            source = journal_service.find_journal_source_by_issns(collection_obj, issns)
+            if not source:
+                logging.info(
+                    "Source not found for collection %s and ISSNs %s",
+                    collection_obj.acron3,
+                    issns,
+                )
+                continue
+
+            try:
+                article_service.upsert_article_document_from_articlemeta(
+                    payload,
+                    collection=collection_obj,
+                    source=source,
+                    user=user,
+                    force_update=force_update,
+                )
+            except DataError as exc:
+                # One bad record must not abort the page: log it and move on.
+                logging.error(
+                    "Error saving Document from Article Meta. "
+                    "Collection: %s, Source: %s, PIDv2: %s. Error: %s",
+                    collection_obj, source.source_id, pid_v2, exc,
+                )
+
+        offset += page_size
+
+    return True
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Article Meta)"), timelimit=-1, queue="load")
+def task_load_documents_from_article_meta(
+    self,
+    from_date=None,
+    until_date=None,
+    days_to_go_back=None,
+    collection=None,
+    issn=None,
+    force_update=True,
+    user_id=None,
+    username=None,
+):
+    """Celery task wrapper around ``load_documents_from_article_meta``."""
+    acting_user = _get_user(self.request, username=username, user_id=user_id)
+    date_window = dict(from_date=from_date, until_date=until_date, days_to_go_back=days_to_go_back)
+    return load_documents_from_article_meta(
+        collection=collection,
+        issn=issn,
+        force_update=force_update,
+        user=acting_user,
+        **date_window,
+    )
diff --git a/document/tasks/common.py b/document/tasks/common.py
new file mode 100644
index 0000000..1645918
--- /dev/null
+++ b/document/tasks/common.py
@@ -0,0 +1,43 @@
+import logging
+
+from collection.models import Collection
+from document.models import Document
+from source.models import Source
+
+
+def _get_collection(acronym):
+    """Return the ``Collection`` whose ``acron3`` is ``acronym``; ``None`` if not found or no acronym."""
+    matches = Collection.objects.filter(acron3=acronym) if acronym else None
+    return matches.first() if matches is not None else None
+
+
+def get_latest_scielo_books_last_seq(collection="books"):
+    """Return the highest ``extra_data["last_seq"]`` among the collection's documents and book sources.
+
+    The result is ``0`` when no usable value is stored.
+    """
+    documents = Document.objects.filter(collection__acron3=collection).only("extra_data")
+    book_sources = Source.objects.filter(
+        collection__acron3=collection, source_type=Source.SOURCE_TYPE_BOOK
+    ).only("extra_data")
+    candidates = (_get_latest_last_seq_from_queryset(rows) for rows in (documents, book_sources))
+    return max(candidates)
+
+
+def _get_latest_last_seq_from_queryset(queryset):
+    """Return the largest integer ``last_seq`` found in the rows' ``extra_data``, never below ``0``."""
+    values = (
+        _coerce_last_seq((row.extra_data or {}).get("last_seq"))
+        for row in queryset.iterator()
+    )
+    return max([0, *(value for value in values if value is not None)])
+
+
+def _coerce_last_seq(value):
+    """Convert a stored ``last_seq`` to ``int``; missing or unparsable values give ``None``."""
+    if value not in (None, ""):
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            logging.warning("Ignoring invalid SciELO Books last_seq value: %r", value)
+    return None
diff --git a/document/tasks/dataverse.py b/document/tasks/dataverse.py
new file mode 100644
index 0000000..15618a5
--- /dev/null
+++ b/document/tasks/dataverse.py
@@ -0,0 +1,80 @@
+import logging
+
+from django.db import DataError
+from django.utils.translation import gettext as _
+
+from core.collectors import dataverse as dataverse_collector
+from core.utils import date_utils
+from core.utils.request_utils import _get_user
+from document.services import datasets as dataset_service
+
+from config import celery_app
+
+from .common import _get_collection
+
+
+def load_dataset_metadata_from_dataverse(
+    from_date=None,
+    until_date=None,
+    days_to_go_back=None,
+    force_update=True,
+    user=None,
+):
+    """Upsert a dataset ``Document`` for every record the Dataverse collector yields.
+
+    Returns ``False`` when the "data" collection does not exist, else ``True``.
+    Records lacking a ``dataset_doi`` or rejected with ``DataError`` are logged and skipped.
+    """
+    from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
+    logging.info(
+        "Loading dataset metadata into documents. From: %s, Until: %s",
+        from_date,
+        until_date,
+    )
+
+    collection_obj = _get_collection("data")
+    if not collection_obj:
+        logging.error("Collection not found: data")
+        return False
+
+    for payload in dataverse_collector.iter_dataset_metadata(from_date, until_date):
+        dataset_doi = payload.get("dataset_doi")
+        if not dataset_doi:
+            logging.error("Dataset DOI not found in record: %s", payload)
+            continue
+
+        try:
+            dataset_service.upsert_dataset_document(
+                payload,
+                collection=collection_obj,
+                user=user,
+                force_update=force_update,
+            )
+        except DataError as exc:
+            # Skip the record the database refused and keep going.
+            logging.error(
+                "Error saving Dataset Document. Collection: %s, PID: %s. Error: %s",
+                collection_obj, dataset_doi, exc,
+            )
+
+    return True
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Dataverse)"), timelimit=-1, queue="load")
+def task_load_dataset_metadata_into_documents(
+    self,
+    from_date=None,
+    until_date=None,
+    days_to_go_back=None,
+    force_update=True,
+    user_id=None,
+    username=None,
+):
+    """Celery task wrapper around ``load_dataset_metadata_from_dataverse``."""
+    acting_user = _get_user(self.request, username=username, user_id=user_id)
+    date_window = dict(from_date=from_date, until_date=until_date, days_to_go_back=days_to_go_back)
+    return load_dataset_metadata_from_dataverse(
+        force_update=force_update,
+        user=acting_user,
+        **date_window,
+    )
diff --git a/document/tasks/opac.py b/document/tasks/opac.py
new file mode 100644
index 0000000..5e1c81e
--- /dev/null
+++ b/document/tasks/opac.py
@@ -0,0 +1,107 @@
+import logging
+
+from django.db import DataError
+from django.utils.translation import gettext as _
+
+from core.collectors import opac as opac_collector
+from core.utils import date_utils
+from core.utils.request_utils import _get_user
+from document.services import articles as article_service
+from source.services import journals as journal_service
+
+from config import celery_app
+
+from .common import _get_collection
+
+
+def load_documents_from_opac(
+    collection="scl",
+    from_date=None,
+    until_date=None,
+    days_to_go_back=None,
+    page=1,
+    force_update=True,
+    user=None,
+):
+    """Upsert OPAC article documents into ``collection``, page by page.
+
+    Payloads without a matching journal source, or whose upsert raises
+    ``DataError``, are logged and skipped. A missing or null ``pages`` total
+    counts as 0. Returns ``False`` if the collection is not found, else ``True``.
+    """
+    from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
+    logging.info(
+        "Loading documents from OPAC. From: %s, Until: %s, Collection: %s",
+        from_date,
+        until_date,
+        collection,
+    )
+
+    collection_obj = _get_collection(collection)
+    if not collection_obj:
+        logging.error("Collection not found: %s", collection)
+        return False
+
+    while True:
+        response = opac_collector.fetch_counter_dict(from_date, until_date, page=page)
+        documents = response.get("documents") or {}
+
+        for payload in documents.values():
+            acronym = payload.get("journal_acronym")
+            source = journal_service.find_journal_source_by_acronym(collection_obj, acronym)
+            if not source:
+                logging.info(
+                    "Source not found for collection %s and acronym %s",
+                    collection_obj.acron3,
+                    acronym,
+                )
+                continue
+
+            try:
+                article_service.upsert_article_document_from_opac(
+                    payload,
+                    collection=collection_obj,
+                    source=source,
+                    user=user,
+                    force_update=force_update,
+                )
+            except DataError as exc:
+                logging.error(
+                    "Error saving Document from OPAC. "
+                    "Collection: %s, Source: %s, PIDv2: %s. Error: %s",
+                    collection_obj,
+                    source.source_id,
+                    payload.get("pid_v2"),
+                    exc,
+                )
+
+        page += 1
+        # ``pages`` may be absent or null; ``or 0`` avoids int(None) raising TypeError.
+        if page > int(response.get("pages") or 0):
+            break
+
+    return True
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (OPAC)"), timelimit=-1, queue="load")
+def task_load_documents_from_opac(
+    self,
+    collection="scl",
+    from_date=None,
+    until_date=None,
+    days_to_go_back=None,
+    page=1,
+    force_update=True,
+    user_id=None,
+    username=None,
+):
+    """Celery entry point: resolve the acting user, then run the OPAC load."""
+    return load_documents_from_opac(
+        collection=collection,
+        from_date=from_date,
+        until_date=until_date,
+        days_to_go_back=days_to_go_back,
+        page=page,
+        force_update=force_update,
+        user=_get_user(self.request, username=username, user_id=user_id),
+    )
diff --git a/document/tasks/pipeline.py b/document/tasks/pipeline.py
new file mode 100644
index 0000000..97bef7c
--- /dev/null
+++ b/document/tasks/pipeline.py
@@ -0,0 +1,24 @@
+import logging
+
+from celery import group
+from django.utils.translation import gettext as _
+
+from config import celery_app
+
+from .articlemeta import task_load_documents_from_article_meta
+from .dataverse import task_load_dataset_metadata_into_documents
+from .opac import task_load_documents_from_opac
+from .preprints import task_load_preprints_into_documents
+from .scielo_books import task_sync_documents_from_scielo_books
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Daily Sync Routine (Auto)"), queue="load")
+def task_daily_metadata_sync_pipeline(self):
+    """Fan out the five metadata sync tasks as a single Celery group."""
+    logging.info("Starting Daily Metadata Sync Pipeline")
+    sync_tasks = (
+        task_load_documents_from_article_meta, task_load_documents_from_opac,
+        task_load_preprints_into_documents, task_load_dataset_metadata_into_documents,
+        task_sync_documents_from_scielo_books,
+    )
+    group([task.s() for task in sync_tasks]).apply_async()
diff --git a/document/tasks/preprints.py b/document/tasks/preprints.py
new file mode 100644
index 0000000..ee63211
--- /dev/null
+++ b/document/tasks/preprints.py
@@ -0,0 +1,82 @@
+import logging
+
+from django.db import DataError
+from django.utils.translation import gettext as _
+
+from core.collectors import preprints as preprints_collector
+from core.utils import date_utils
+from core.utils.request_utils import _get_user
+from document.services import preprints as preprint_service
+
+from config import celery_app
+
+from .common import _get_collection
+
+
+def load_preprints_from_preprints_api(
+    from_date=None,
+    until_date=None,
+    days_to_go_back=None,
+    force_update=True,
+    user=None,
+):
+    """Upsert preprint records into the fixed "preprints" collection.
+
+    Each record from ``preprints_collector.iter_records`` is passed through
+    ``preprints_collector.extract_record_data`` before the upsert.
+
+    Payloads without ``pid_generic`` and upserts raising ``DataError`` are
+    logged and skipped. Returns ``False`` when the "preprints" collection
+    lookup is falsy and ``True`` once the iteration finishes.
+    """
+    from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
+    logging.info("Loading preprints into documents. From: %s, Until: %s", from_date, until_date)
+
+    preprints_collection = _get_collection("preprints")
+    if not preprints_collection:
+        logging.error("Collection not found: preprints")
+        return False
+
+    for raw_record in preprints_collector.iter_records(from_date, until_date):
+        data = preprints_collector.extract_record_data(raw_record)
+        if not data.get("pid_generic"):
+            logging.error("Preprint ID not found in record: %s", raw_record)
+            continue
+
+        try:
+            preprint_service.upsert_preprint_document(
+                data,
+                collection=preprints_collection,
+                user=user,
+                force_update=force_update,
+            )
+        except DataError as error:
+            # One bad record must not abort the rest of the batch.
+            logging.error(
+                "Error saving Preprint Document. Collection: %s, PID: %s. Error: %s",
+                preprints_collection,
+                data.get("pid_generic"),
+                error,
+            )
+
+    return True
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Preprints)"), timelimit=-1, queue="load")
+def task_load_preprints_into_documents(
+    self,
+    from_date=None,
+    until_date=None,
+    days_to_go_back=None,
+    force_update=True,
+    user_id=None,
+    username=None,
+):
+    """Celery entry point: resolve the acting user, then run the preprints load."""
+    return load_preprints_from_preprints_api(
+        from_date=from_date,
+        until_date=until_date,
+        days_to_go_back=days_to_go_back,
+        force_update=force_update,
+        user=_get_user(self.request, username=username, user_id=user_id),
+    )
diff --git a/document/tasks/scielo_books.py b/document/tasks/scielo_books.py
new file mode 100644
index 0000000..ddbd462
--- /dev/null
+++ b/document/tasks/scielo_books.py
@@ -0,0 +1,247 @@
+import logging
+
+from django.conf import settings
+from django.utils.translation import gettext as _
+
+from core.collectors import scielo_books as scielo_books_collector
+from core.utils.request_utils import _get_user
+from document.services import books as document_books_service
+from source.services import books as source_books_service
+
+from config import celery_app
+
+from .common import get_latest_scielo_books_last_seq
+
+
+def load_documents_from_scielo_books(
+    collection="books",
+    db_name=None,
+    since=0,
+    limit=None,
+    force_update=True,
+    headers=None,
+    base_url=None,
+    user=None,
+):
+    """Replay the SciELO Books change feed into sources and documents.
+
+    ``db_name`` and ``limit`` fall back to ``settings.SCIELO_BOOKS_DB_NAME`` and
+    ``settings.SCIELO_BOOKS_LIMIT`` when falsy. For every change yielded by
+    ``scielo_books_collector.iter_change_documents``:
+
+    * deleted: the document is deleted by raw id, and the book source too when
+      ``has_monograph_document_for_raw_id`` was truthy before that delete;
+    * ``TYPE == "Monograph"``: the source and its book document are upserted;
+    * ``TYPE == "Part"``: the parent monograph is upserted first (from the
+      in-run cache or a fetch), then the part document under it;
+    * any other ``TYPE`` is ignored.
+
+    Returns ``True`` once the feed is exhausted.
+    """
+    db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+    limit = limit or settings.SCIELO_BOOKS_LIMIT
+    collection_obj = source_books_service.get_books_collection(collection)
+    monograph_cache = {}
+    # Keyword arguments shared by every upsert call below.
+    shared = {"collection": collection_obj, "user": user, "force_update": force_update}
+
+    logging.info(
+        "Loading documents from SciELO Books. Collection: %s, DB: %s, Since: %s, Limit: %s",
+        collection,
+        db_name,
+        since,
+        limit,
+    )
+
+    changes = scielo_books_collector.iter_change_documents(
+        base_url=base_url,
+        db_name=db_name,
+        since=since,
+        limit=limit,
+        headers=headers,
+    )
+    for item in changes:
+        change = item["change"]
+        raw_id = change.get("id")
+
+        if item["deleted"]:
+            had_monograph = document_books_service.has_monograph_document_for_raw_id(
+                collection_obj,
+                raw_id,
+            )
+            document_books_service.delete_document_by_raw_id(collection_obj, raw_id)
+            if had_monograph:
+                source_books_service.delete_book_source(collection_obj, raw_id)
+            continue
+
+        payload = item["payload"] or {}
+        source_url = item.get("source_url")
+        last_seq = change.get("seq")
+        record_type = payload.get("TYPE")
+
+        if record_type == "Monograph":
+            source = source_books_service.upsert_monograph_source(
+                payload,
+                source_url=source_url,
+                last_seq=last_seq,
+                **shared,
+            )
+            document_books_service.upsert_monograph_document(
+                payload,
+                source=source,
+                source_url=source_url,
+                last_seq=last_seq,
+                **shared,
+            )
+            monograph_cache[str(payload.get("id"))] = payload
+            continue
+
+        if record_type != "Part":
+            continue
+
+        monograph_payload = _get_monograph_payload(
+            payload,
+            monograph_cache=monograph_cache,
+            base_url=base_url,
+            db_name=db_name,
+            headers=headers,
+        )
+        if not monograph_payload:
+            logging.warning(
+                "Skipping part %s because monograph %s could not be loaded.",
+                payload.get("id"),
+                payload.get("monograph"),
+            )
+            continue
+
+        # The parent is upserted without a source_url; only the part carries it.
+        parent_kwargs = dict(shared, source_url=None, last_seq=last_seq)
+        source = source_books_service.upsert_monograph_source(monograph_payload, **parent_kwargs)
+        parent_document = document_books_service.upsert_monograph_document(
+            monograph_payload,
+            source=source,
+            **parent_kwargs,
+        )
+        document_books_service.upsert_part_document(
+            document_books_service.enrich_part_payload(payload, monograph_payload),
+            source=source,
+            parent_document=parent_document,
+            source_url=source_url,
+            last_seq=last_seq,
+            **shared,
+        )
+
+    return True
+
+
+def sync_documents_from_scielo_books(
+    collection="books",
+    db_name=None,
+    limit=None,
+    force_update=True,
+    headers=None,
+    base_url=None,
+    user=None,
+):
+    """Load SciELO Books changes incrementally from the latest recorded sequence.
+
+    ``since`` is taken from ``get_latest_scielo_books_last_seq(collection)``;
+    the remaining arguments are forwarded to
+    ``load_documents_from_scielo_books``, whose result is returned.
+    """
+    db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+    limit = limit or settings.SCIELO_BOOKS_LIMIT
+    resume_from = get_latest_scielo_books_last_seq(collection=collection)
+    logging.info(
+        "Syncing documents from SciELO Books incrementally. Collection: %s, Since: %s, Limit: %s",
+        collection,
+        resume_from,
+        limit,
+    )
+    return load_documents_from_scielo_books(
+        collection=collection, db_name=db_name, since=resume_from, limit=limit,
+        force_update=force_update, headers=headers, base_url=base_url, user=user,
+    )
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Manual)"), queue="load")
+def task_load_documents_from_scielo_books(
+    self,
+    collection="books",
+    db_name=None,
+    since=0,
+    limit=None,
+    force_update=True,
+    headers=None,
+    base_url=None,
+    user_id=None,
+    username=None,
+):
+    """Celery entry point for a SciELO Books load from an explicit ``since``."""
+    db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+    limit = limit or settings.SCIELO_BOOKS_LIMIT
+    return load_documents_from_scielo_books(
+        collection=collection,
+        db_name=db_name,
+        since=since,
+        limit=limit,
+        force_update=force_update,
+        headers=headers,
+        base_url=base_url,
+        user=_get_user(self.request, username=username, user_id=user_id),
+    )
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Incremental)"), queue="load")
+def task_sync_documents_from_scielo_books(
+    self,
+    collection="books",
+    db_name=None,
+    limit=None,
+    force_update=True,
+    headers=None,
+    base_url=None,
+    user_id=None,
+    username=None,
+):
+    """Celery entry point for the incremental SciELO Books sync."""
+    db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+    limit = limit or settings.SCIELO_BOOKS_LIMIT
+    return sync_documents_from_scielo_books(
+        collection=collection,
+        db_name=db_name,
+        limit=limit,
+        force_update=force_update,
+        headers=headers,
+        base_url=base_url,
+        user=_get_user(self.request, username=username, user_id=user_id),
+    )
+
+
+def _get_monograph_payload(payload, monograph_cache, base_url=None, db_name=None, headers=None):
+    """Return the parent monograph payload for a part, or ``None``.
+
+    ``monograph_cache`` is consulted first and filled after a successful fetch.
+    Fetch errors are logged as warnings and reported as ``None``.
+    """
+    monograph_id = payload.get("monograph")
+    if not monograph_id:
+        return None
+
+    cache_key = str(monograph_id)
+    if cache_key not in monograph_cache:
+        try:
+            fetched, _ignored = scielo_books_collector.fetch_document(
+                doc_id=monograph_id,
+                base_url=base_url,
+                db_name=db_name or settings.SCIELO_BOOKS_DB_NAME,
+                headers=headers,
+            )
+        except Exception as exc:
+            logging.warning(
+                "Failed to fetch monograph %s for part %s: %s",
+                monograph_id, payload.get("id"), exc,
+            )
+            return None
+        monograph_cache[cache_key] = fetched
+    return monograph_cache[cache_key]
diff --git a/document/tests.py b/document/tests.py
new file mode 100644
index 0000000..14d9bcd
--- /dev/null
+++ b/document/tests.py
@@ -0,0 +1,255 @@
+from django.test import TestCase
+from unittest.mock import patch
+
+from collection.models import Collection
+from document import tasks as document_tasks
+from source.services import books as source_books_service
+from source.models import Source
+
+from .models import Document
+from .services import articles as article_service
+from .services import books as books_service
+from .services import datasets as dataset_service
+from .services import preprints as preprint_service
+
+
+class DocumentMetadataTests(TestCase):  # Document.metadata() plus the per-source upsert services
+    def test_metadata_includes_source_context_and_legacy_identifiers(self):  # one row, with source fields
+        collection = Collection.objects.create(acron3="scl", acron2="sc")
+        source = Source.objects.create(
+            collection=collection,
+            source_type=Source.SOURCE_TYPE_JOURNAL,
+            source_id="1234-5678",
+            scielo_issn="1234-5678",
+            title="Test Journal",
+            identifiers={"scielo_issn": "1234-5678"},
+        )
+        Document.objects.create(
+            collection=collection,
+            source=source,
+            document_type=Document.DOCUMENT_TYPE_ARTICLE,
+            document_id="S123456782024000100001",
+            scielo_issn="1234-5678",
+            pid_v2="S123456782024000100001",
+            pid_v3="abc123",
+            title="Test Article",
+            identifiers={"doi": "10.1590/example"},
+            files={"pt": {"path": "/pdf/test.pdf"}},
+            default_lang="en",
+            text_langs=["en", "pt"],
+            publication_date="2024-01-15",
+            publication_year="2024",
+        )
+
+        metadata = list(Document.metadata(collection=collection))
+
+        self.assertEqual(len(metadata), 1)
+        self.assertEqual(metadata[0]["document_type"], Document.DOCUMENT_TYPE_ARTICLE)
+        self.assertEqual(metadata[0]["document_id"], "S123456782024000100001")
+        self.assertEqual(metadata[0]["source_type"], Source.SOURCE_TYPE_JOURNAL)
+        self.assertEqual(metadata[0]["source_id"], "1234-5678")
+        self.assertEqual(metadata[0]["scielo_issn"], "1234-5678")
+
+    def test_upsert_monograph_and_part_documents_from_books_payload(self):  # book + chapter id formats
+        collection = Collection.objects.create(acron3="books", acron2="bk")
+        monograph_payload = {
+            "TYPE": "Monograph",
+            "id": "abcd1",
+            "title": "Sample Book",
+            "isbn": "9788578791889",
+            "eisbn": "9788578791880",
+            "doi_number": "10.1234/book",
+            "language": "pt",
+            "publication_date": "2024-05-20",
+            "year": "2024",
+            "publisher": "SciELO Books",
+        }
+        part_payload = {
+            "TYPE": "Part",
+            "id": "18",
+            "monograph": "abcd1",
+            "title": "Chapter 18",
+            "text_language": "es",
+            "order": "18",
+        }
+
+        source = source_books_service.upsert_monograph_source(
+            monograph_payload,
+            collection=collection,
+        )
+        parent_document = books_service.upsert_monograph_document(
+            monograph_payload,
+            collection=collection,
+            source=source,
+        )
+        chapter = books_service.upsert_part_document(
+            books_service.enrich_part_payload(part_payload, monograph_payload),
+            collection=collection,
+            source=source,
+            parent_document=parent_document,
+        )
+
+        self.assertEqual(parent_document.document_type, Document.DOCUMENT_TYPE_BOOK)
+        self.assertEqual(parent_document.document_id, "book:abcd1")
+        self.assertEqual(parent_document.pid_generic, "book:abcd1")
+        self.assertEqual(chapter.document_type, Document.DOCUMENT_TYPE_CHAPTER)
+        self.assertEqual(chapter.document_id, "book:abcd1/chapter:18")
+        self.assertEqual(chapter.parent_document, parent_document)
+        self.assertEqual(chapter.identifiers["book_id"], "abcd1")
+        self.assertEqual(chapter.default_lang, "es")
+
+    def test_articlemeta_and_opac_upsert_same_document(self):  # same PID from both feeds -> one row
+        collection = Collection.objects.create(acron3="scl", acron2="sc")
+        source = Source.objects.create(
+            collection=collection,
+            source_type=Source.SOURCE_TYPE_JOURNAL,
+            source_id="1234-5678",
+            scielo_issn="1234-5678",
+            acronym="testjou",
+            title="Test Journal",
+            identifiers={"scielo_issn": "1234-5678"},
+        )
+
+        first = article_service.upsert_article_document_from_articlemeta(
+            {
+                "code": "S123456782024000100001",
+                "title": "Article Title",
+                "pdfs": {"en": {"url": "/pdf/en.pdf"}},
+                "processing_date": "2024-02-10",
+                "publication_date": "2024-01-15",
+                "publication_year": "2024",
+                "default_language": "en",
+                "text_langs": ["en", "pt"],
+                "code_title": ["1234-5678"],
+            },
+            collection=collection,
+            source=source,
+        )
+        second = article_service.upsert_article_document_from_opac(
+            {
+                "pid_v2": "S123456782024000100001",
+                "pid_v3": "S1234-56782024000100001",
+                "title": "Article Title",
+                "journal_acronym": "testjou",
+                "publication_date": "2024-01-15",
+                "default_language": "en",
+                "text_langs": ["en", "pt"],
+            },
+            collection=collection,
+            source=source,
+        )
+
+        self.assertEqual(first.pk, second.pk)
+        self.assertEqual(Document.objects.count(), 1)
+        second.refresh_from_db()
+        self.assertEqual(second.pid_v3, "S1234-56782024000100001")
+        self.assertEqual(second.identifiers["journal_acronym"], "testjou")
+
+    def test_upsert_preprint_document_maps_metadata(self):  # payload fields land on the Document
+        collection = Collection.objects.create(acron3="preprints", acron2="pp")
+
+        document = preprint_service.upsert_preprint_document(
+            {
+                "pid_generic": "preprint/123",
+                "title": "Preprint Title",
+                "text_langs": ["en", "pt"],
+                "default_language": "en",
+                "publication_date": "2024-01-20",
+                "publication_year": "2024",
+            },
+            collection=collection,
+        )
+
+        self.assertEqual(document.document_type, Document.DOCUMENT_TYPE_PREPRINT)
+        self.assertEqual(document.document_id, "preprint/123")
+        self.assertEqual(document.pid_generic, "preprint/123")
+        self.assertEqual(document.default_lang, "en")
+
+    def test_upsert_dataset_document_accumulates_files(self):  # two payloads, one DOI -> both file ids kept
+        collection = Collection.objects.create(acron3="data", acron2="dt")
+
+        dataset_service.upsert_dataset_document(
+            {
+                "title": "Dataset Title",
+                "dataset_doi": "10.1234/dataset",
+                "dataset_published": "2024-03-15",
+                "file_id": "1",
+                "file_name": "first.csv",
+                "file_url": "https://example.org/first.csv",
+                "file_persistent_id": "pid:first",
+            },
+            collection=collection,
+        )
+        document = dataset_service.upsert_dataset_document(
+            {
+                "title": "Dataset Title",
+                "dataset_doi": "10.1234/dataset",
+                "dataset_published": "2024-03-15",
+                "file_id": "2",
+                "file_name": "second.csv",
+                "file_url": "https://example.org/second.csv",
+                "file_persistent_id": "pid:second",
+            },
+            collection=collection,
+        )
+
+        self.assertEqual(document.document_type, Document.DOCUMENT_TYPE_DATASET)
+        self.assertEqual(document.document_id, "10.1234/dataset")
+        self.assertEqual(set(document.files.keys()), {"1", "2"})
+
+
+class DocumentBooksSyncTests(TestCase):  # resume-point (last_seq) handling for the SciELO Books sync
+    def test_get_latest_scielo_books_last_seq_uses_documents_and_sources(self):  # 135 (str) beats 120 (int)
+        collection = Collection.objects.create(acron3="books", acron2="bk")
+        source = Source.objects.create(
+            collection=collection,
+            source_type=Source.SOURCE_TYPE_BOOK,
+            source_id="book-1",
+            title="Book 1",
+            extra_data={"last_seq": 120},
+        )
+        Document.objects.create(
+            collection=collection,
+            source=source,
+            document_type=Document.DOCUMENT_TYPE_BOOK,
+            document_id="book:book-1",
+            extra_data={"last_seq": "135"},
+        )
+
+        self.assertEqual(document_tasks.get_latest_scielo_books_last_seq("books"), 135)
+
+    def test_sync_documents_from_scielo_books_uses_computed_since(self):  # loader is mocked; kwargs are pinned
+        collection = Collection.objects.create(acron3="books", acron2="bk")
+        source = Source.objects.create(
+            collection=collection,
+            source_type=Source.SOURCE_TYPE_BOOK,
+            source_id="book-1",
+            title="Book 1",
+            extra_data={"last_seq": 120},
+        )
+        Document.objects.create(
+            collection=collection,
+            source=source,
+            document_type=Document.DOCUMENT_TYPE_BOOK,
+            document_id="book:book-1",
+            extra_data={"last_seq": 135},
+        )
+
+        with patch("document.tasks.scielo_books.load_documents_from_scielo_books", return_value=True) as mocked:
+            result = document_tasks.sync_documents_from_scielo_books(
+                collection="books",
+                db_name="scielobooks_1a",
+                limit=500,
+            )
+
+        self.assertTrue(result)
+        mocked.assert_called_once_with(
+            collection="books",
+            db_name="scielobooks_1a",
+            since=135,
+            limit=500,
+            force_update=True,
+            headers=None,
+            base_url=None,
+            user=None,
+        )
diff --git a/article/wagtail_hooks.py b/document/wagtail_hooks.py
similarity index 50%
rename from article/wagtail_hooks.py
rename to document/wagtail_hooks.py
index 4cf55bd..de291c9 100644
--- a/article/wagtail_hooks.py
+++ b/document/wagtail_hooks.py
@@ -1,39 +1,35 @@
from django.utils.translation import gettext_lazy as _
from wagtail.snippets.views.snippets import SnippetViewSet
-from wagtail.snippets.models import register_snippet
-from config.menu import get_menu_order
+from .models import Document
-from .models import Article
-
-class ArticleSnippetViewSet(SnippetViewSet):
- model = Article
+class DocumentSnippetViewSet(SnippetViewSet):
+ model = Document
icon = "folder-open-inverse"
- menu_name = "article"
- menu_label = _("Article")
- menu_order = get_menu_order("article")
- add_to_admin_menu = True
+ menu_label = _("Document")
+ menu_order = 300
list_display = (
"collection",
- "scielo_issn",
+ "document_type",
+ "document_id",
+ "source",
+ "title",
"pid_v2",
"pid_v3",
"pid_generic",
- "files",
"publication_year",
)
list_filter = (
"collection",
- "scielo_issn",
+ "document_type",
"publication_year",
)
search_fields = (
- "scielo_issn",
+ "document_id",
+ "title",
"pid_v2",
"pid_v3",
"pid_generic",
)
-
-register_snippet(ArticleSnippetViewSet)
diff --git a/journal/__init__.py b/journal/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/journal/admin.py b/journal/admin.py
deleted file mode 100644
index 8c38f3f..0000000
--- a/journal/admin.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.contrib import admin
-
-# Register your models here.
diff --git a/journal/migrations/0001_initial.py b/journal/migrations/0001_initial.py
deleted file mode 100644
index 7164bbc..0000000
--- a/journal/migrations/0001_initial.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Generated by Django 5.0.7 on 2025-02-07 17:50
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- initial = True
-
- dependencies = [
- ("collection", "0001_initial"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.CreateModel(
- name="Journal",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
- ),
- (
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
- ),
- (
- "scielo_issn",
- models.CharField(max_length=9, verbose_name="SciELO ISSN"),
- ),
- (
- "issns",
- models.JSONField(
- blank=True, default=dict, null=True, verbose_name="ISSNs"
- ),
- ),
- (
- "acronym",
- models.CharField(
- blank=True,
- default="",
- max_length=32,
- null=True,
- verbose_name="Journal Acronym",
- ),
- ),
- (
- "title",
- models.CharField(max_length=255, verbose_name="Journal Title"),
- ),
- (
- "publisher_name",
- models.JSONField(
- blank=True,
- default=list,
- null=True,
- verbose_name="Publisher Name",
- ),
- ),
- (
- "subject_areas",
- models.JSONField(
- default=list, verbose_name="Subject Areas (CAPES)"
- ),
- ),
- (
- "wos_subject_areas",
- models.JSONField(default=list, verbose_name="Subject Areas (WoS)"),
- ),
- (
- "collection",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="collection.collection",
- verbose_name="Collection",
- ),
- ),
- (
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
- ),
- ),
- (
- "updated_by",
- models.ForeignKey(
- blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
- ),
- ),
- ],
- options={
- "verbose_name": "Journal",
- "verbose_name_plural": "Journals",
- "unique_together": {("collection", "scielo_issn", "acronym")},
- },
- ),
- ]
diff --git a/journal/migrations/0002_alter_journal_scielo_issn.py b/journal/migrations/0002_alter_journal_scielo_issn.py
deleted file mode 100644
index 07cf94f..0000000
--- a/journal/migrations/0002_alter_journal_scielo_issn.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-12 17:16
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("journal", "0001_initial"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="journal",
- name="scielo_issn",
- field=models.CharField(
- db_index=True, max_length=9, verbose_name="SciELO ISSN"
- ),
- ),
- ]
diff --git a/journal/migrations/__init__.py b/journal/migrations/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/journal/models.py b/journal/models.py
deleted file mode 100644
index 0d830e9..0000000
--- a/journal/models.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from django.db import models
-from django.utils.translation import gettext_lazy as _
-
-from core.models import CommonControlField
-from collection.models import Collection
-
-
-class Journal(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.CASCADE,
- blank=False,
- null=False,
- db_index=True,
- )
-
- scielo_issn = models.CharField(
- verbose_name=_('SciELO ISSN'),
- max_length=9,
- blank=False,
- null=False,
- db_index=True,
- )
-
- issns = models.JSONField(
- verbose_name=_('ISSNs'),
- null=True,
- blank=True,
- default=dict,
- )
-
- acronym = models.CharField(
- verbose_name=_('Journal Acronym'),
- max_length=32,
- blank=True,
- null=True,
- default='',
- )
-
- title = models.CharField(
- verbose_name=_('Journal Title'),
- max_length=255,
- blank=False,
- null=False,
- )
-
- publisher_name = models.JSONField(
- verbose_name=_('Publisher Name'),
- blank=True,
- null=True,
- default=list,
- )
-
- subject_areas = models.JSONField(
- verbose_name=_('Subject Areas (CAPES)'),
- null=False,
- blank=False,
- default=list,
- )
-
- wos_subject_areas = models.JSONField(
- verbose_name=_('Subject Areas (WoS)'),
- null=False,
- blank=False,
- default=list,
- )
-
- def __str__(self):
- return f'{self.collection.acron2} - {self.scielo_issn} - {self.acronym}'
-
- @classmethod
- def metadata(cls, collection=None):
- queryset = cls.objects.all()
- if collection:
- queryset = queryset.filter(collection=collection)
-
- for journal in queryset.only(
- 'acronym', 'collection__acron3', 'issns', 'publisher_name',
- 'scielo_issn', 'subject_areas', 'title', 'wos_subject_areas'
- ):
- yield {
- 'acronym': journal.acronym,
- 'collection': journal.collection.acron3,
- 'issns': set([v for v in journal.issns.values() if v]),
- 'publisher_name': journal.publisher_name,
- 'scielo_issn': journal.scielo_issn,
- 'subject_areas': journal.subject_areas,
- 'title': journal.title,
- 'wos_subject_areas': journal.wos_subject_areas,
- }
-
- class Meta:
- verbose_name = _('Journal')
- verbose_name_plural = _('Journals')
- unique_together = (
- 'collection',
- 'scielo_issn',
- 'acronym',
- )
diff --git a/journal/tasks.py b/journal/tasks.py
deleted file mode 100644
index 71681cb..0000000
--- a/journal/tasks.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import logging
-
-from django.contrib.auth import get_user_model
-from django.db import IntegrityError
-from django.utils import timezone
-from django.utils.translation import gettext as _
-
-from collection.models import Collection
-from config import celery_app
-from core.utils.utils import _get_user
-
-from . import models, utils
-
-
-User = get_user_model()
-
-
-@celery_app.task(bind=True, name=_('Load journal data from Article Meta'), queue='load')
-def task_load_journal_data_from_article_meta(self, collections=[], force_update=True, user_id=None, username=None, mode='thrift'):
- user = _get_user(user_id, username)
-
- for col in collections or Collection.acron3_list():
- for j in utils.fetch_article_meta_journals(collection=col, mode=mode):
- collection = Collection.objects.get(acron3=j.collection_acronym)
- if not collection:
- logging.error(f'Collection {j.collection_acronym} does not exist')
- continue
-
- try:
- journal, created = models.Journal.objects.get_or_create(collection=collection, scielo_issn=j.scielo_issn)
- except IntegrityError as e:
- logging.error(f'Journal {j} has not been created due to error: {e}')
- continue
-
- if created:
- journal.creator = user
- journal.created = timezone.now()
-
- if created or force_update:
- journal.updated_by = user
- journal.updated = timezone.now()
- journal.issns = {
- 'electronic_issn': j.electronic_issn or '',
- 'print_issn': j.print_issn or '',
- 'scielo_issn': j.scielo_issn
- }
- journal.acronym = j.acronym
- journal.title = j.title
- journal.publisher_name = j.publisher_name or ''
- journal.subject_areas = j.subject_areas or []
- journal.wos_subject_areas = j.wos_subject_areas or []
- logging.info(f'Journal {"created" if created else "updated"}: {journal}')
-
- journal.save()
-
- return True
diff --git a/journal/tests.py b/journal/tests.py
deleted file mode 100644
index 7ce503c..0000000
--- a/journal/tests.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
diff --git a/journal/utils.py b/journal/utils.py
deleted file mode 100644
index 8a80521..0000000
--- a/journal/utils.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from articlemeta.client import ThriftClient, RestfulClient
-
-
-def fetch_article_meta_journals(collection='scl', mode='rest'):
- """
- Fetches article metadata from journals.
-
- Returns
- -------
- list
- A list of article metadata.
- """
- if mode == 'rest':
- am = RestfulClient()
- elif mode == 'thrift':
- am = ThriftClient()
-
- for j in am.journals(collection=collection):
- yield j
diff --git a/journal/views.py b/journal/views.py
deleted file mode 100644
index 91ea44a..0000000
--- a/journal/views.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.
diff --git a/journal/wagtail_hooks.py b/journal/wagtail_hooks.py
deleted file mode 100644
index 725b370..0000000
--- a/journal/wagtail_hooks.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from django.utils.translation import gettext_lazy as _
-from wagtail.snippets.views.snippets import SnippetViewSet
-from wagtail.snippets.models import register_snippet
-
-from config.menu import get_menu_order
-
-from .models import Journal
-
-
-class JournalSnippetViewSet(SnippetViewSet):
- model = Journal
- icon = "folder-open-inverse"
- menu_name = "journal"
- menu_label = _("Journal")
- menu_order = get_menu_order('journal')
- add_to_admin_menu = True
-
- list_display = (
- "collection",
- "scielo_issn",
- "acronym",
- "title",
- "issns",
- "publisher_name",
- "subject_areas",
- "wos_subject_areas",
- )
- list_filter = (
- "collection",
- )
- search_fields = (
- "issns",
- "acronym",
- "publisher_name",
- "subject_areas",
- "wos_subject_areas",
- )
-
-
-register_snippet(JournalSnippetViewSet)
diff --git a/log_manager/choices.py b/log_manager/choices.py
index e98c8f2..c6e461a 100644
--- a/log_manager/choices.py
+++ b/log_manager/choices.py
@@ -19,13 +19,3 @@
(LOG_FILE_STATUS_IGNORED, _("Ignored")),
]
-
-COLLECTION_LOG_FILE_DATE_COUNT_OK = 'OK'
-COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES = 'MIS'
-COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES = 'EXT'
-
-COLLECTION_LOG_FILE_DATE_COUNT = [
- (COLLECTION_LOG_FILE_DATE_COUNT_OK, _("OK")),
- (COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES, _("Missing Files")),
- (COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES, _("Extra files")),
-]
diff --git a/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py b/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py
new file mode 100644
index 0000000..d30cdf4
--- /dev/null
+++ b/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py
@@ -0,0 +1,52 @@
+# Generated by Django 5.2.12 on 2026-05-01 22:23
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("log_manager", "0009_collectionlogfiledatecount_exported_files_count"),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name="logfiledate",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="logfiledate",
+ name="log_file",
+ ),
+ migrations.RemoveField(
+ model_name="logfiledate",
+ name="updated_by",
+ ),
+ migrations.RemoveField(
+ model_name="logfile",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="logfile",
+ name="updated_by",
+ ),
+ migrations.AddField(
+ model_name="logfile",
+ name="date",
+ field=models.DateField(
+ blank=True, db_index=True, null=True, verbose_name="Date"
+ ),
+ ),
+ migrations.AddField(
+ model_name="logfile",
+ name="parse_heartbeat_at",
+ field=models.DateTimeField(
+ blank=True, null=True, verbose_name="Parse Heartbeat At"
+ ),
+ ),
+ migrations.DeleteModel(
+ name="CollectionLogFileDateCount",
+ ),
+ migrations.DeleteModel(
+ name="LogFileDate",
+ ),
+ ]
diff --git a/log_manager/models.py b/log_manager/models.py
index fc3a8b6..6bf04d8 100644
--- a/log_manager/models.py
+++ b/log_manager/models.py
@@ -1,209 +1,20 @@
import logging
-from django.db import models
-from django.db.models import Q
+from django.db import IntegrityError, models
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
from wagtail.admin.panels import FieldPanel
from wagtailautocomplete.edit_handlers import AutocompletePanel
from collection.models import Collection
-from core.forms import CoreAdminModelForm
-from core.models import CommonControlField
from . import choices
-class LogFileDate(CommonControlField):
- date = models.DateField(
- verbose_name=_("Date"),
- null=False,
- blank=False,
- db_index=True,
- )
-
- log_file = models.ForeignKey(
- 'LogFile',
- verbose_name=_('Log File'),
- blank=True,
- on_delete=models.DO_NOTHING,
- db_index=True,
- )
-
- base_form_class = CoreAdminModelForm
-
- panel = [
- FieldPanel('date'),
- AutocompletePanel('log_file')
- ]
-
- class Meta:
- ordering = ['-date']
- verbose_name = _("Log File Date")
- verbose_name_plural = _("Log File Dates")
- unique_together = (
- 'date',
- 'log_file',
- )
- indexes = [
- models.Index(fields=['date', 'log_file']),
- ]
-
- @classmethod
- def create_or_update(cls, user, log_file, date):
- obj, created = cls.objects.get_or_create(
- log_file=log_file,
- date=date,
- )
-
- if not created:
- obj.updated_by = user
- obj.updated = timezone.now()
- else:
- obj.creator = user
- obj.created = timezone.now()
-
- return obj
-
- @classmethod
- def filter_by_collection_and_date(cls, collection, date):
- return cls.objects.filter(
- ~Q(log_file__status__in=[
- choices.LOG_FILE_STATUS_CREATED,
- choices.LOG_FILE_STATUS_INVALIDATED
- ]),
- log_file__collection__acron3=collection,
- date=date,
- )
-
- @classmethod
- def get_number_of_found_files_for_date(cls, collection, date):
- return cls.objects.filter(
- ~Q(log_file__status__in=[
- choices.LOG_FILE_STATUS_CREATED,
- choices.LOG_FILE_STATUS_INVALIDATED
- ]),
- log_file__collection__acron3=collection,
- date=date,
- ).count()
-
- def __str__(self):
- return f'{self.log_file.path}-{self.date}'
-
-
-class CollectionLogFileDateCount(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
- null=False,
- blank=False,
- )
-
- date = models.DateField(
- _('Date'),
- null=False,
- blank=False,
- )
-
- year = models.IntegerField(
- _('Year'),
- null=False,
- blank=False,
- )
-
- month = models.IntegerField(
- _('Month'),
- null=False,
- blank=False,
- )
-
- found_log_files = models.IntegerField(
- verbose_name=_('Number of Found Valid Log Files'),
- default=0,
- )
-
- expected_log_files = models.IntegerField(
- verbose_name=_('Number of Expected Valid Log Files'),
- blank=True,
- null=True,
- )
-
- is_usage_metric_computed = models.BooleanField(
- verbose_name=_('Is Usage Metric Computed'),
- default=False,
- )
-
- exported_files_count = models.SmallIntegerField(
- verbose_name=_('Exported Files Count'),
- default=0,
- )
-
- status = models.CharField(
- verbose_name=_('Status'),
- choices=choices.COLLECTION_LOG_FILE_DATE_COUNT,
- max_length=3,
- )
-
- def set_status(self):
- if self.found_log_files < self.expected_log_files:
- self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES
- elif self.found_log_files > self.expected_log_files:
- self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES
- else:
- self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_OK
-
- def set_is_usage_metric_computed(self):
- if self.exported_files_count == self.found_log_files:
- self.is_usage_metric_computed = True
-
- @classmethod
- def create_or_update(cls, user, collection, date, expected_log_files, found_log_files):
- obj, created = cls.objects.get_or_create(
- collection=collection,
- date=date,
- month=date.month,
- year=date.year,
- )
-
- if not created:
- obj.updated_by = user
- obj.updated = timezone.now()
- else:
- obj.creator = user
- obj.created = timezone.now()
-
- obj.expected_log_files = expected_log_files
- obj.found_log_files = found_log_files
- obj.set_status()
-
- obj.save()
- return obj
-
- class Meta:
- ordering = ['-date']
- verbose_name = _("Collection Log File Date Count")
- unique_together = (
- 'collection',
- 'date',
- )
-
- panels = [
- AutocompletePanel('collection'),
- FieldPanel('date'),
- FieldPanel('year'),
- FieldPanel('month'),
- FieldPanel('found_log_files'),
- FieldPanel('expected_log_files'),
- FieldPanel('status'),
- FieldPanel('is_usage_metric_computed'),
- ]
-
- def __str__(self):
- return f'{self.collection.acron3}-{self.date}'
-
-
-class LogFile(CommonControlField):
+class LogFile(models.Model):
+ created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
+ updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True)
+ date = models.DateField(verbose_name=_("Date"), null=True, blank=True, db_index=True)
hash = models.CharField(_("Hash MD5"), max_length=32, null=True, blank=True, unique=True)
path = models.CharField(_("Name"), max_length=255, null=False, blank=False)
@@ -246,19 +57,25 @@ class LogFile(CommonControlField):
default=0,
)
+ parse_heartbeat_at = models.DateTimeField(
+ _("Parse Heartbeat At"),
+ null=True,
+ blank=True,
+ )
+
panels = [
FieldPanel('hash'),
+ FieldPanel('date'),
FieldPanel('path'),
FieldPanel('stat_result'),
FieldPanel('status'),
FieldPanel('validation'),
FieldPanel('summary'),
FieldPanel('last_processed_line'),
+ FieldPanel('parse_heartbeat_at'),
AutocompletePanel('collection'),
]
- base_form_class = CoreAdminModelForm
-
class Meta:
verbose_name = _("Log File")
verbose_name_plural = _("Log Files")
@@ -268,25 +85,28 @@ def get(cls, hash):
return cls.objects.get(hash=hash)
@classmethod
- def create_or_update(cls, user, collection, path, stat_result, hash, status=None):
+ def create_or_update(cls, collection, path, stat_result, hash, status=None):
try:
+ obj, created = cls.objects.get_or_create(
+ hash=hash,
+ defaults={
+ "collection": collection,
+ "path": path,
+ "stat_result": stat_result,
+ "status": status or choices.LOG_FILE_STATUS_CREATED,
+ },
+ )
+ except IntegrityError:
obj = cls.get(hash=hash)
- obj.updated_by = user
+ created = False
+
+ if created:
+ logging.info(f'File {path} added to the database.')
+ else:
obj.updated = timezone.now()
+ obj.save(update_fields=["updated"])
logging.info(f'File {path} already exists in the database.')
- except cls.DoesNotExist:
- obj = cls()
- obj.creator = user
- obj.created = timezone.now()
- obj.collection = collection
- obj.path = path
- obj.stat_result = stat_result
- obj.hash = hash
- obj.status = status or choices.LOG_FILE_STATUS_CREATED
- logging.info(f'File {path} added to the database.')
-
- obj.save()
return obj
def __str__(self):
diff --git a/log_manager_config/exceptions.py b/log_manager_config/exceptions.py
index ad7581a..0a6a6a9 100644
--- a/log_manager_config/exceptions.py
+++ b/log_manager_config/exceptions.py
@@ -4,11 +4,5 @@ class UndefinedCollectionLogDirectoryError(Exception):
class UndefinedCollectionEmailError(Exception):
...
-class UndefinedCollectionFilesPerDayError(Exception):
- ...
-
class UndefinedSupportedLogFile(Exception):
...
-
-class MultipleFilesPerDayForTheSameDateError(Exception):
- ...
diff --git a/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py b/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py
new file mode 100644
index 0000000..5b6351c
--- /dev/null
+++ b/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py
@@ -0,0 +1,223 @@
+# Generated by Django 5.2.12 on 2026-05-01 22:27
+
+import django.db.models.deletion
+import modelcluster.fields
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("collection", "0001_initial"),
+ ("log_manager_config", "0003_alter_collectionemail_options_and_more"),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name="LogManagerCollectionConfig",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "created",
+ models.DateTimeField(
+ auto_now_add=True, verbose_name="Creation date"
+ ),
+ ),
+ (
+ "updated",
+ models.DateTimeField(
+ auto_now=True, verbose_name="Last update date"
+ ),
+ ),
+ (
+ "sample_size",
+ models.FloatField(default=0.1, verbose_name="Sample Size"),
+ ),
+ (
+ "buffer_size",
+ models.IntegerField(default=2048, verbose_name="Buffer Size"),
+ ),
+ (
+ "expected_logs_per_day",
+ models.IntegerField(
+ default=1, verbose_name="Expected Logs Per Day"
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Log Manager Collection Config",
+ "verbose_name_plural": "Log Manager Collection Configs",
+ },
+ ),
+ migrations.RemoveField(
+ model_name="collectionlogfilesperday",
+ name="collection",
+ ),
+ migrations.RemoveField(
+ model_name="collectionlogfilesperday",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="collectionlogfilesperday",
+ name="updated_by",
+ ),
+ migrations.RemoveField(
+ model_name="collectionurltranslatorclass",
+ name="collection",
+ ),
+ migrations.RemoveField(
+ model_name="collectionurltranslatorclass",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="collectionurltranslatorclass",
+ name="directory",
+ ),
+ migrations.RemoveField(
+ model_name="collectionurltranslatorclass",
+ name="updated_by",
+ ),
+ migrations.RemoveField(
+ model_name="collectionvalidationparameters",
+ name="collection",
+ ),
+ migrations.RemoveField(
+ model_name="collectionvalidationparameters",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="collectionvalidationparameters",
+ name="updated_by",
+ ),
+ migrations.RemoveField(
+ model_name="supportedlogfile",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="supportedlogfile",
+ name="updated_by",
+ ),
+ migrations.RemoveConstraint(
+ model_name="collectionemail",
+ name="unique_collection_email",
+ ),
+ migrations.RemoveConstraint(
+ model_name="collectionlogdirectory",
+ name="unique_collection_path",
+ ),
+ migrations.RemoveField(
+ model_name="collectionemail",
+ name="collection",
+ ),
+ migrations.RemoveField(
+ model_name="collectionlogdirectory",
+ name="collection",
+ ),
+ migrations.AddField(
+ model_name="collectionemail",
+ name="sort_order",
+ field=models.IntegerField(blank=True, editable=False, null=True),
+ ),
+ migrations.AddField(
+ model_name="collectionlogdirectory",
+ name="sort_order",
+ field=models.IntegerField(blank=True, editable=False, null=True),
+ ),
+ migrations.AddField(
+ model_name="collectionlogdirectory",
+ name="translator_class",
+ field=models.CharField(
+ default="URLTranslatorClassicSite", verbose_name="URL Translator Class"
+ ),
+ ),
+ migrations.AddField(
+ model_name="logmanagercollectionconfig",
+ name="collection",
+ field=models.OneToOneField(
+ on_delete=django.db.models.deletion.CASCADE,
+ related_name="log_manager_config",
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ migrations.AddField(
+ model_name="logmanagercollectionconfig",
+ name="creator",
+ field=models.ForeignKey(
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_creator",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Creator",
+ ),
+ ),
+ migrations.AddField(
+ model_name="logmanagercollectionconfig",
+ name="updated_by",
+ field=models.ForeignKey(
+ blank=True,
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_last_mod_user",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Updater",
+ ),
+ ),
+ migrations.AddField(
+ model_name="collectionemail",
+ name="config",
+ field=modelcluster.fields.ParentalKey(
+ blank=True,
+ null=True,
+ on_delete=django.db.models.deletion.CASCADE,
+ related_name="emails",
+ to="log_manager_config.logmanagercollectionconfig",
+ ),
+ ),
+ migrations.AddField(
+ model_name="collectionlogdirectory",
+ name="config",
+ field=modelcluster.fields.ParentalKey(
+ blank=True,
+ null=True,
+ on_delete=django.db.models.deletion.CASCADE,
+ related_name="directories",
+ to="log_manager_config.logmanagercollectionconfig",
+ ),
+ ),
+ migrations.AddConstraint(
+ model_name="collectionemail",
+ constraint=models.UniqueConstraint(
+ fields=("config", "email"), name="unique_config_email"
+ ),
+ ),
+ migrations.AddConstraint(
+ model_name="collectionlogdirectory",
+ constraint=models.UniqueConstraint(
+ fields=("config", "path"), name="unique_config_path"
+ ),
+ ),
+ migrations.DeleteModel(
+ name="CollectionLogFilesPerDay",
+ ),
+ migrations.DeleteModel(
+ name="CollectionURLTranslatorClass",
+ ),
+ migrations.DeleteModel(
+ name="CollectionValidationParameters",
+ ),
+ migrations.DeleteModel(
+ name="SupportedLogFile",
+ ),
+ ]
diff --git a/log_manager_config/models.py b/log_manager_config/models.py
index 384368e..8cf3e34 100644
--- a/log_manager_config/models.py
+++ b/log_manager_config/models.py
@@ -4,38 +4,57 @@
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
+from modelcluster.models import ClusterableModel
+from modelcluster.fields import ParentalKey
+from wagtail.models import Orderable
+from wagtail.admin.panels import FieldPanel, InlinePanel
+from wagtailautocomplete.edit_handlers import AutocompletePanel
+
from collection.models import Collection
from core.models import CommonControlField
-from .exceptions import MultipleFilesPerDayForTheSameDateError, UndefinedCollectionFilesPerDayError
-class CollectionLogDirectory(CommonControlField):
- collection = models.ForeignKey(
+class LogManagerCollectionConfig(ClusterableModel, CommonControlField):
+ collection = models.OneToOneField(
Collection,
verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
+ on_delete=models.CASCADE,
+ related_name="log_manager_config"
)
- path = models.CharField(
- verbose_name=_('Path'),
- max_length=255,
- blank=False,
+ sample_size = models.FloatField(
+ verbose_name=_('Sample Size'),
+ blank=False,
null=False,
+ default=0.1,
)
- directory_name = models.CharField(
- verbose_name=_('Directory Name'),
- max_length=255,
- blank=True,
- null=True,
+ buffer_size = models.IntegerField(
+ verbose_name=_('Buffer Size'),
+ blank=False,
+ null=False,
+ default=2048,
)
- active = models.BooleanField(
- verbose_name=_('Active'),
- default=True,
+ expected_logs_per_day = models.IntegerField(
+ verbose_name=_('Expected Logs Per Day'),
+ default=1,
)
+ panels = [
+ AutocompletePanel("collection"),
+ FieldPanel("sample_size"),
+ FieldPanel("buffer_size"),
+ FieldPanel("expected_logs_per_day"),
+ InlinePanel("directories", label=_("Directories")),
+ InlinePanel("emails", label=_("Emails")),
+ ]
+
def __str__(self):
- return f'{self.collection} - {self.path} - {self.directory_name}'
-
+ return f'{self.collection.acron3} Config'
+
+ class Meta:
+ verbose_name = _('Log Manager Collection Config')
+ verbose_name_plural = _('Log Manager Collection Configs')
+
@classmethod
def load(cls, data, user):
for item in data:
@@ -45,13 +64,12 @@ def load(cls, data, user):
logging.warning(f'Collection {item.get("acronym")} not found.')
continue
- logging.info(item)
cls.create_or_update(
user=user,
collection=collection,
- directory_name=item.get('directory_name'),
- path=item.get('path'),
- active=item.get('active', True),
+ sample_size=item.get('sample_size', 0.1),
+ buffer_size=item.get('buffer_size', 2048),
+ expected_logs_per_day=item.get('quantity', 1),
)
@classmethod
@@ -59,81 +77,66 @@ def create_or_update(
cls,
user,
collection,
- directory_name,
- path,
- active,
+ sample_size,
+ buffer_size,
+ expected_logs_per_day,
):
- try:
- obj = cls.objects.get(collection=collection, path=path)
- except cls.DoesNotExist:
- obj = cls()
+ obj, created = cls.objects.get_or_create(collection=collection)
+ if created:
obj.creator = user
obj.created = timezone.now()
- obj.collection = collection
obj.updated_by = user
obj.updated = timezone.now()
- obj.directory_name = directory_name
- obj.path = path
- obj.active = active
-
+ obj.sample_size = sample_size
+ obj.buffer_size = buffer_size
+ obj.expected_logs_per_day = expected_logs_per_day
obj.save()
- logging.info(f'{collection.acron3} - {directory_name} - {path}')
+ logging.info(f'Config for {collection.acron3} updated.')
return obj
- class Meta:
- verbose_name = _('Collection Log Directory')
- verbose_name_plural = _('Collection Log Directories')
- constraints = [
- models.UniqueConstraint(fields=['collection', 'path'], name='unique_collection_path')
- ]
-class CollectionLogFilesPerDay(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
+class CollectionLogDirectory(Orderable, CommonControlField):
+ config = ParentalKey(
+ 'LogManagerCollectionConfig',
+ related_name='directories',
+ on_delete=models.CASCADE,
+ null=True,
+ blank=True,
)
- start_date = models.DateField(
- verbose_name=_('Start Date'),
- blank=False,
+ path = models.CharField(
+ verbose_name=_('Path'),
+ max_length=255,
+ blank=False,
null=False,
)
- end_date = models.DateField(
- verbose_name=_('End Date'),
+ directory_name = models.CharField(
+ verbose_name=_('Directory Name'),
+ max_length=255,
blank=True,
null=True,
)
- quantity = models.IntegerField(
- verbose_name=_('Quantity'),
- default=1,
+ active = models.BooleanField(
+ verbose_name=_('Active'),
+ default=True,
+ )
+ translator_class = models.CharField(
+ verbose_name=_('URL Translator Class'),
+ blank=False,
+ null=False,
+ default='URLTranslatorClassicSite',
)
def __str__(self):
- return f'{self.start_date} - {self.quantity}'
+ return f'{self.config.collection} - {self.path} - {self.directory_name}'
- @classmethod
- def get_number_of_expected_files_by_day(cls, collection, date):
- files_by_day = cls.objects.filter(
- models.Q(collection__acron3=collection) &
- models.Q(start_date__lte=date) &
- (models.Q(end_date__gte=date) | models.Q(end_date__isnull=True))
- )
-
- if files_by_day.count() > 1:
- raise MultipleFilesPerDayForTheSameDateError(_("ERROR. Please, set the field end_date for the collection {collection}."))
-
- if files_by_day.count() == 0:
- raise UndefinedCollectionFilesPerDayError(_("ERROR. Please, set the number of files per day for the collection {collection}."))
-
- return int(files_by_day.get().quantity)
-
@classmethod
def load(cls, data, user):
for item in data:
try:
collection = Collection.objects.get(acron3=item.get('acronym'))
+ config, _ = LogManagerCollectionConfig.objects.get_or_create(collection=collection)
except Collection.DoesNotExist:
logging.warning(f'Collection {item.get("acronym")} not found.')
continue
@@ -141,52 +144,55 @@ def load(cls, data, user):
logging.info(item)
cls.create_or_update(
user=user,
- collection=collection,
- start_date=item.get('start_date'),
- quantity=item.get('quantity'),
- end_date=item.get('end_date'),
+ config=config,
+ directory_name=item.get('directory_name'),
+ path=item.get('path'),
+ active=item.get('active', True),
)
@classmethod
def create_or_update(
cls,
user,
- collection,
- start_date,
- quantity,
- end_date,
+ config,
+ directory_name,
+ path,
+ active,
):
try:
- obj = cls.objects.get(collection=collection, start_date=start_date)
+ obj = cls.objects.get(config=config, path=path)
except cls.DoesNotExist:
obj = cls()
obj.creator = user
obj.created = timezone.now()
- obj.collection = collection
-
+ obj.config = config
+
obj.updated_by = user
obj.updated = timezone.now()
- obj.start_date = start_date
- obj.quantity = quantity
- obj.end_date = end_date
-
+ obj.directory_name = directory_name
+ obj.path = path
+ obj.active = active
+
obj.save()
- logging.info(f'{collection.acron3} - {start_date} - {quantity}')
+ logging.info(f'{config.collection.acron3} - {directory_name} - {path}')
return obj
class Meta:
- verbose_name = _('Collection Log Files Per Day')
- verbose_name_plural = _('Collection Log Files Per Day')
+ verbose_name = _('Collection Log Directory')
+ verbose_name_plural = _('Collection Log Directories')
constraints = [
- models.UniqueConstraint(fields=['collection', 'start_date'], name='unique_collection_start_date')
+ models.UniqueConstraint(fields=['config', 'path'], name='unique_config_path')
]
-class CollectionEmail(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
+
+class CollectionEmail(Orderable, CommonControlField):
+ config = ParentalKey(
+ 'LogManagerCollectionConfig',
+ related_name='emails',
+ on_delete=models.CASCADE,
+ null=True,
+ blank=True,
)
name = models.CharField(
verbose_name=_('Name'),
@@ -218,6 +224,7 @@ def load(cls, data, user):
for item in data:
try:
collection = Collection.objects.get(acron3=item.get('acronym'))
+ config, _ = LogManagerCollectionConfig.objects.get_or_create(collection=collection)
except Collection.DoesNotExist:
logging.warning(f'Collection {item.get("acronym")} not found.')
continue
@@ -225,7 +232,7 @@ def load(cls, data, user):
logging.info(item)
cls.create_or_update(
user=user,
- collection=collection,
+ config=config,
email=item.get('e-mail'),
name=item.get('name'),
position=item.get('position'),
@@ -236,19 +243,19 @@ def load(cls, data, user):
def create_or_update(
cls,
user,
- collection,
+ config,
email,
name,
position,
active,
):
try:
- obj = cls.objects.get(collection=collection, email=email)
+ obj = cls.objects.get(config=config, email=email)
except cls.DoesNotExist:
obj = cls()
obj.creator = user
obj.created = timezone.now()
- obj.collection = collection
+ obj.config = config
obj.email = email
obj.updated_by = user
@@ -258,213 +265,14 @@ def create_or_update(
obj.active = active
obj.save()
- logging.info(f'{collection.acron3} - {name} - {position} - {email}')
+ logging.info(f'{config.collection.acron3} - {name} - {position} - {email}')
return obj
class Meta:
verbose_name = _('Collection Email')
verbose_name_plural = _('Collection Emails')
constraints = [
- models.UniqueConstraint(fields=['collection', 'email'], name='unique_collection_email')
- ]
-
-
-class CollectionValidationParameters(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
- primary_key=True,
- )
- sample_size = models.FloatField(
- verbose_name=_('Sample Size'),
- blank=False,
- null=False,
- default=0.1,
- )
- buffer_size = models.IntegerField(
- verbose_name=_('Buffer Size'),
- blank=False,
- null=False,
- default=2048,
- )
-
- def __str__(self):
- return f'{self.collection.acron3} - {self.sample_size} - {self.buffer_size}'
-
- @classmethod
- def load(cls, data, user):
- for item in data:
- try:
- collection = Collection.objects.get(acron3=item.get('acronym'))
- except Collection.DoesNotExist:
- logging.warning(f'Collection {item.get("acronym")} not found.')
- continue
-
- logging.info(item)
- cls.create_or_update(
- user=user,
- collection=collection,
- sample_size=item.get('sample_size'),
- buffer_size=item.get('buffer_size'),
- )
-
- @classmethod
- def create_or_update(
- cls,
- user,
- collection,
- sample_size,
- buffer_size,
- ):
- try:
- obj = cls.objects.get(collection=collection)
- except cls.DoesNotExist:
- obj = cls()
- obj.creator = user
- obj.created = timezone.now()
- obj.collection = collection
-
- obj.updated_by = user
- obj.updated = timezone.now()
- obj.sample_size = sample_size
- obj.buffer_size = buffer_size
-
- obj.save()
- logging.info(f'{collection.acron3} - {sample_size} - {buffer_size}')
- return obj
-
- class Meta:
- verbose_name = _('Collection Validation Parameters')
- verbose_name_plural = _('Collection Validation Parameters')
-
-
-class CollectionURLTranslatorClass(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
- )
- directory = models.ForeignKey(
- CollectionLogDirectory,
- verbose_name=_('Directory'),
- on_delete=models.DO_NOTHING,
- )
- translator_class = models.CharField(
- verbose_name=_('URL Translator Class'),
- blank=False,
- null=False,
- default='URLTranslatorClassicSite',
- )
-
- def __str__(self):
- return f'{self.collection.acron3} - {self.directory} - {self.translator_class}'
-
- class Meta:
- verbose_name = _('Collection URL Translator Class')
- verbose_name_plural = _('Collection URL Translator Classes')
- constraints = [
- models.UniqueConstraint(fields=['collection', 'directory'], name='unique_collection_directory')
+ models.UniqueConstraint(fields=['config', 'email'], name='unique_config_email')
]
- @classmethod
- def load(cls, data, user):
- for item in data:
- try:
- collection = Collection.objects.get(acron3=item.get('acronym'))
- except Collection.DoesNotExist:
- logging.warning(f'Collection {item.get("acronym")} not found.')
- continue
-
- try:
- directory = CollectionLogDirectory.objects.get(collection=collection, path=item.get('path'))
- logging.info(item)
- cls.create_or_update(
- user=user,
- collection=collection,
- directory=directory,
- translator_class=item.get('translator_class'),
- )
- except CollectionLogDirectory.DoesNotExist:
- logging.warning(f'Directory {item.get("path")} not found.')
- continue
- @classmethod
- def create_or_update(
- cls,
- user,
- collection,
- directory,
- translator_class,
- ):
- try:
- obj = cls.objects.get(collection=collection)
- except cls.DoesNotExist:
- obj = cls()
- obj.creator = user
- obj.created = timezone.now()
- obj.collection = collection
- obj.directory = directory
-
- obj.updated_by = user
- obj.updated = timezone.now()
- obj.translator_class = translator_class
-
- obj.save()
- logging.info(f'{collection.acron3} - {directory.path} - {translator_class}')
- return obj
-
-
-class SupportedLogFile(CommonControlField):
- file_extension = models.CharField(
- verbose_name=_('File Extension'),
- max_length=255,
- unique=True,
- blank=False,
- null=False,
- )
- description = models.TextField(
- verbose_name=_('Description'),
- blank=True,
- null=True,
- )
-
- def __str__(self):
- return f'{self.file_extension}'
-
- @classmethod
- def load(cls, data, user):
- for item in data:
- logging.info(item)
- cls.create_or_update(
- user=user,
- file_extension=item.get('file_extension'),
- description=item.get('description'),
- )
-
- @classmethod
- def create_or_update(
- cls,
- user,
- file_extension,
- description,
- ):
- try:
- obj = cls.objects.get(file_extension=file_extension)
- except cls.DoesNotExist:
- obj = cls()
- obj.creator = user
- obj.created = timezone.now()
-
- obj.updated_by = user
- obj.updated = timezone.now()
- obj.file_extension = file_extension
- obj.description = description
-
- obj.save()
- logging.info(f'{file_extension}')
- return obj
-
- class Meta:
- verbose_name = _('Supported Log File')
- verbose_name_plural = _('Supported Log Files')
diff --git a/merge_production_dotenvs_in_dotenv.py b/merge_production_dotenvs_in_dotenv.py
deleted file mode 100644
index d1170ef..0000000
--- a/merge_production_dotenvs_in_dotenv.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import os
-from pathlib import Path
-from typing import Sequence
-
-import pytest
-
-ROOT_DIR_PATH = Path(__file__).parent.resolve()
-PRODUCTION_DOTENVS_DIR_PATH = ROOT_DIR_PATH / ".envs" / ".production"
-PRODUCTION_DOTENV_FILE_PATHS = [
- PRODUCTION_DOTENVS_DIR_PATH / ".django",
- PRODUCTION_DOTENVS_DIR_PATH / ".postgres",
-]
-DOTENV_FILE_PATH = ROOT_DIR_PATH / ".env"
-
-
-def merge(
- output_file_path: str, merged_file_paths: Sequence[str], append_linesep: bool = True
-) -> None:
- with open(output_file_path, "w") as output_file:
- for merged_file_path in merged_file_paths:
- with open(merged_file_path, "r") as merged_file:
- merged_file_content = merged_file.read()
- output_file.write(merged_file_content)
- if append_linesep:
- output_file.write(os.linesep)
-
-
-def main():
- merge(DOTENV_FILE_PATH, PRODUCTION_DOTENV_FILE_PATHS)
-
-
-@pytest.mark.parametrize("merged_file_count", range(3))
-@pytest.mark.parametrize("append_linesep", [True, False])
-def test_merge(tmpdir_factory, merged_file_count: int, append_linesep: bool):
- tmp_dir_path = Path(str(tmpdir_factory.getbasetemp()))
-
- output_file_path = tmp_dir_path / ".env"
-
- expected_output_file_content = ""
- merged_file_paths = []
- for i in range(merged_file_count):
- merged_file_ord = i + 1
-
- merged_filename = ".service{}".format(merged_file_ord)
- merged_file_path = tmp_dir_path / merged_filename
-
- merged_file_content = merged_filename * merged_file_ord
-
- with open(merged_file_path, "w+") as file:
- file.write(merged_file_content)
-
- expected_output_file_content += merged_file_content
- if append_linesep:
- expected_output_file_content += os.linesep
-
- merged_file_paths.append(merged_file_path)
-
- merge(output_file_path, merged_file_paths, append_linesep)
-
- with open(output_file_path, "r") as output_file:
- actual_output_file_content = output_file.read()
-
- assert actual_output_file_content == expected_output_file_content
-
-
-if __name__ == "__main__":
- main()
diff --git a/metrics/es.py b/metrics/es.py
deleted file mode 100644
index 25ad701..0000000
--- a/metrics/es.py
+++ /dev/null
@@ -1,385 +0,0 @@
-import logging
-
-from elasticsearch import Elasticsearch, helpers, NotFoundError
-from django.conf import settings
-
-from .utils import index_utils
-
-
-DEFAULT_ES_INDEX_USAGE_MAPPINGS = {
- "properties": {
- "collection": {
- "type": "keyword"
- },
- "journal": {
- "properties": {
- "scielo_issn": {
- "type": "keyword"
- },
- "main_title": {
- "type": "keyword"
- },
- "subject_area_capes": {
- "type": "keyword"
- },
- "subject_area_wos": {
- "type": "keyword"
- },
- "acronym": {
- "type": "keyword"
- },
- "publisher": {
- "type": "keyword"
- }
- }
- },
- "pid": {
- "type": "keyword"
- },
- "pid_v2": {
- "type": "keyword"
- },
- "pid_v3": {
- "type": "keyword"
- },
- "pid_generic": {
- "type": "keyword"
- },
- "year_of_publication": {
- "type": "integer"
- },
- "media_language": {
- "type": "keyword"
- },
- "country_code": {
- "type": "keyword"
- },
- "date": {
- "type": "date",
- "format": "yyyy-MM-dd"
- },
- "total_requests": {
- "type": "integer"
- },
- "total_investigations": {
- "type": "integer"
- },
- "unique_requests": {
- "type": "integer"
- },
- "unique_investigations": {
- "type": "integer"
- }
- }
-}
-
-
-class ElasticSearchUsageWrapper:
- """
- Wrapper for Elasticsearch usage metrics operations.
- This class provides methods to interact with Elasticsearch for indexing,
- deleting, and managing usage metrics data.
- """
-
- def __init__(self, url=None, basic_auth=None, api_key=None, verify_certs=False):
- self.client = self.get_elasticsearch_client(url, basic_auth, api_key, verify_certs)
-
-
- def get_elasticsearch_client(self, url=None, basic_auth=None, api_key=None, verify_certs=False):
- """
- Create an Elasticsearch client instance using Django settings.
-
- :param url: Elasticsearch URL. If None, it will be taken from Django settings.
- :param basic_auth: Basic authentication credentials. If None, it will be taken from Django settings.
- :param api_key: API key. If None, it will be taken from Django settings.
- :param verify_certs: Whether to verify SSL certificates. If None, it will be taken from Django settings.
- """
- if not url:
- url = getattr(settings, "ES_URL", None)
-
- if not basic_auth:
- basic_auth = getattr(settings, "ES_BASIC_AUTH", None)
-
- if not api_key:
- api_key = getattr(settings, "ES_API_KEY", None)
-
- if not verify_certs:
- verify_certs = getattr(settings, "ES_VERIFY_CERTS", False)
-
- if basic_auth:
- client = Elasticsearch(url, basic_auth=basic_auth, verify_certs=verify_certs)
- elif api_key:
- client = Elasticsearch(url, api_key=api_key, verify_certs=verify_certs)
- else:
- client = Elasticsearch(url, verify_certs=verify_certs)
-
- return client
-
-
- def ping(self):
- """
- Check if the Elasticsearch client is available.
- Returns True if the client is available, False otherwise.
- """
- try:
- return self.client.ping()
- except Exception as e:
- logging.error(f"Error pinging Elasticsearch client: {e}")
- return False
-
-
- def create_index(self, index_name, mappings=None, ping_client=False):
- """
- Create an Elasticsearch index.
-
- :param index_name: Name of the index to create.
- :param mappings: Mappings for the index. If None, default mappings will be used.
- :param ping_client: If True, checks if the Elasticsearch client is available before creating the index.
- """
- if ping_client and not self.ping():
- return
-
- if not mappings:
- mappings = DEFAULT_ES_INDEX_USAGE_MAPPINGS
-
- resp = self.client.indices.create(
- index=index_name,
- mappings=mappings,
- )
- logging.info(f"Index {index_name} created: {resp}")
-
-
- def create_index_if_not_exists(self, index_name, mappings=None, ping_client=False):
- """
- Create an Elasticsearch index if it does not already exist.
-
- :param index_name: Name of the index to create.
- :param mappings: Mappings for the index. If None, default mappings will be used.
- :param ping_client: If True, checks if the Elasticsearch client is available before creating the index.
- """
- if ping_client and not self.ping():
- return
-
- if not self.client.indices.exists(index=index_name):
- self.create_index(index_name, mappings, ping_client)
- else:
- logging.info(f"Index {index_name} already exists. Skipping creation.")
-
-
- def delete_index(self, index_name, ping_client=False):
- """
- Delete an Elasticsearch index.
-
- :param index_name: Name of the index to delete.
- :param ping_client: If True, checks if the Elasticsearch client is available before deleting the index.
- """
- if ping_client and not self.ping():
- return
-
- self.client.indices.delete(index=index_name)
-
-
- def index_document(self, index_name, doc_id, document, ping_client=False):
- """
- Index a document in Elasticsearch.
-
- :param index_name: Name of the index.
- :param doc_id: ID of the document.
- :param document: Document to index.
- :param ping_client: If True, checks if the Elasticsearch client is available before indexing the document.
- """
- if ping_client and not self.ping():
- return
-
- self.client.index(index=index_name, id=doc_id, document=document)
-
-
- def index_documents(self, index_name, documents, ping_client=False):
- """
- Index multiple documents in Elasticsearch.
-
- :param index_name: Name of the index.
- :param documents: Dictionary of documents to index, where keys are document IDs and values are the documents.
- :param ping_client: If True, checks if the Elasticsearch client is available before indexing the documents.
- """
- if ping_client and not self.ping():
- return
-
- helpers.bulk(
- self.client,
- (
- {
- "_index": index_name,
- "_id": doc_id,
- "_source": document,
- }
- for doc_id, document in documents.items()
- ),
- )
-
-
- def delete_document(self, index_name, doc_id, ping_client=False):
- """
- Delete a document from Elasticsearch.
-
- :param index_name: Name of the index.
- :param doc_id: ID of the document to delete.
- :param ping_client: If True, checks if the Elasticsearch client is available before deleting the document.
- """
- if ping_client and not self.ping():
- return
-
- try:
- self.client.delete(index=index_name, id=doc_id)
- except NotFoundError as e:
- logging.error(f"Failed to delete document {doc_id} from Elasticsearch: {e}")
-
-
- def delete_documents(self, index_name, doc_ids, ping_client=False):
- """
- Delete multiple documents from Elasticsearch using bulk.
- :param index_name: Name of the index.
- :param doc_ids: List of document IDs to delete.
- :param ping_client: If True, checks if the Elasticsearch client is available before deleting the documents.
- """
- if ping_client and not self.ping():
- return
-
- actions = (
- {
- "_op_type": "delete",
- "_index": index_name,
- "_id": doc_id,
- }
- for doc_id in doc_ids
- )
-
- try:
- helpers.bulk(self.client, actions)
- except helpers.BulkIndexError as e:
- logging.error(f"BulkIndexError occurred: {e.errors}")
-
-
- def delete_documents_by_key(self, index_name, data, ping_client=False):
- """
- Delete multiple documents from Elasticsearch based on specific key-value pairs.
-
- :param index_name: Name of the index.
- :param data: Dictionary where keys are field names and values are single values or lists of values.
- :param ping_client: If True, checks if the Elasticsearch client is available before deleting the documents.
- """
- if ping_client and not self.ping():
- return
-
- query = {
- "query": {
- "bool": {
- "must": [
- {
- "terms": {
- key: values if isinstance(values, list) else [values]
- }
- }
- for key, values in data.items()
- ]
- }
- }
- }
-
- try:
- self.client.delete_by_query(index=index_name, body=query)
- return True
- except Exception as e:
- logging.error(f"Failed to delete documents: {e}")
-
- return False
-
-
- def fetch_and_update_documents_locally(self, index_name, documents, batch_size=5000, ping_client=False):
- """
- Fetch existing documents from Elasticsearch and update local documents with accumulated metrics.
- This function retrieves documents from Elasticsearch in batches and merges their metric fields
- with the provided local documents. The merge operation adds values for specific metric fields
- or sets them if they don't exist in the local documents.
-
- Args:
- index_name (str): Name of the Elasticsearch index to fetch documents from.
- documents (dict): Dictionary of documents to be updated, where keys are document IDs and values
- are dictionaries containing metric data.
- batch_size (int, optional): Number of documents to fetch in each batch from Elasticsearch.
- Defaults to 5000.
- ping_client (bool, optional): If True, checks if the Elasticsearch client is available before
- fetching documents. Defaults to False.
-
- Returns:
- None: The function modifies the input documents dictionary in-place.
- """
- if ping_client and not self.ping():
- return
-
- existing_docs = {}
- ids = list(documents.keys())
-
- for i in range(0, len(ids), batch_size):
- batch_ids = ids[i:i+batch_size]
- resp = self.client.mget(index=index_name, ids=batch_ids)
- for doc in resp.get('docs', []):
- if doc.get('found'):
- existing_docs[doc['_id']] = doc['_source']
- logging.info(f'Found {len(existing_docs)} existing documents in Elasticsearch for update.')
-
- for doc_id, existing in existing_docs.items():
- current = documents[doc_id]
- for field in [
- "total_requests",
- "unique_requests",
- "total_investigations",
- "unique_investigations",
- ]:
- if field in existing and field in current:
- current[field] += existing[field]
- elif field in existing:
- current[field] = existing[field]
-
-
- def export_to_index(self, index_name, data, batch_size=5000, ping_client=False):
- """
- Export data to Elasticsearch index in bulk operations.
- This function converts input data to index documents, processes them locally,
- and then indexes them to Elasticsearch in batches to optimize performance.
-
- Args:
- index_name (str): Name of the Elasticsearch index to export data to.
- data: The data to be exported to the Elasticsearch index
- batch_size (int, optional): Number of documents to process in each bulk operation.
- Defaults to 5000.
- ping_client (bool, optional): If True, checks if the Elasticsearch client is available
-
- Returns:
- None: Function performs side effects by indexing data to Elasticsearch
- """
- if ping_client and not self.ping():
- return
-
- bulk_data = []
- documents = index_utils.convert_to_index_documents(data)
- self.fetch_and_update_documents_locally(index_name=index_name, documents=documents)
-
- for key, metric_data in documents.items():
- metric_data['pid'] = metric_data.get('pid_v3') or metric_data.get('pid_v2') or metric_data.get('pid_generic', '')
- bulk_data.append({
- "_id": key,
- "_source": metric_data,
- })
-
- if len(bulk_data) >= batch_size:
- self.index_documents(
- index_name=index_name,
- documents={doc["_id"]: doc["_source"] for doc in bulk_data},
- )
- bulk_data = []
-
- self.index_documents(
- index_name=index_name,
- documents={doc["_id"]: doc["_source"] for doc in bulk_data},
- )
diff --git a/metrics/fixtures/top100articles.csv b/metrics/fixtures/top100articles.csv
deleted file mode 100755
index 9d979f3..0000000
--- a/metrics/fixtures/top100articles.csv
+++ /dev/null
@@ -1,97 +0,0 @@
-print_issn online_issn pid_issn collection pid yop year_month_day total_item_requests total_item_investigations unique_item_requests unique_item_investigations
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300005 2005 2024-05-26 13 16 13 16
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100020 2009 2024-05-26 9 10 8 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200012 2009 2024-05-26 8 9 8 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200018 2009 2024-05-26 8 8 8 8
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300004 2005 2024-05-26 8 11 8 11
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200011 2009 2024-05-26 8 9 8 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200001 2009 2024-05-26 7 7 7 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200010 2009 2024-05-26 7 9 7 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300007 2005 2024-05-26 7 10 7 10
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200003 2009 2024-05-26 7 9 7 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400010 2008 2024-05-26 7 7 7 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300008 2005 2024-05-26 7 9 7 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000400008 2009 2024-05-26 7 7 7 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400009 2006 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000200009 2006 2024-05-26 6 7 6 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000100007 2010 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000300003 2007 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100022 2009 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000100006 2010 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200002 2009 2024-05-26 6 7 6 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000100002 2010 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000200014 2007 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100021 2009 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000400010 2010 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000200001 2010 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000200002 2010 2024-05-26 6 7 6 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200014 2009 2024-05-26 5 6 5 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100014 2009 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000200009 2005 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200004 2009 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000100016 2006 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000200015 2006 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000300005 2007 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000300009 2009 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000200010 2010 2024-05-26 4 4 4 4
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000100015 2008 2024-05-26 3 4 3 4
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300002 2005 2024-05-26 2 5 2 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200015 2009 2024-05-26 2 3 2 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300001 2005 2024-05-26 2 5 2 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300009 2005 2024-05-26 2 4 2 4
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200005 2009 2024-05-26 2 4 2 4
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200008 2009 2024-05-26 2 3 2 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300006 2005 2024-05-26 2 5 2 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300010 2005 2024-05-26 2 2 2 2
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300003 2005 2024-05-26 2 5 2 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000300001 2006 2024-05-26 2 2 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100005 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200016 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000400004 2005 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000100009 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000100014 2005 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200009 2009 2024-05-26 1 2 1 2
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000100019 2006 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200013 2009 2024-05-26 1 3 1 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400007 2008 2024-05-26 1 2 1 2
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000300010 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200006 2009 2024-05-26 1 3 1 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000200018 2006 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400002 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000300005 2010 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000200007 2006 2024-05-26 1 3 1 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400004 2006 2024-05-26 1 2 1 2
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000100004 2007 2024-05-26 1 3 1 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000200021 2007 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000100002 2007 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100004 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000400004 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400006 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400005 2006 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000300006 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400011 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000300001 2007 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000100020 2007 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400002 2006 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000100005 2005 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200017 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000100005 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200007 2009 2024-05-26 1 4 1 4
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100023 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000100008 2008 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400008 2006 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000400005 2005 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000200006 2006 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000400007 2005 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000200013 2008 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400003 2006 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000400006 2009 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000300008 2007 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000200008 2005 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000200006 2008 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400004 2008 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000400006 2005 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000300006 2007 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000300003 2006 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000100007 2008 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000100009 2006 2024-05-26 0 1 0 1
diff --git a/metrics/fixtures/top100articles.tar.gz b/metrics/fixtures/top100articles.tar.gz
deleted file mode 100644
index cd49556..0000000
Binary files a/metrics/fixtures/top100articles.tar.gz and /dev/null differ
diff --git a/metrics/migrations/0001_initial.py b/metrics/migrations/0001_initial.py
index 30ccc96..9746d5f 100644
--- a/metrics/migrations/0001_initial.py
+++ b/metrics/migrations/0001_initial.py
@@ -1,4 +1,4 @@
-# Generated by Django 5.0.7 on 2024-08-30 00:52
+# Generated by Codex on 2026-04-27
import django.db.models.deletion
from django.conf import settings
@@ -9,13 +9,13 @@ class Migration(migrations.Migration):
initial = True
dependencies = [
- ("wagtaildocs", "0013_delete_uploadeddocument"),
+ ("collection", "0001_initial"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
- name="Top100ArticlesFile",
+ name="DailyMetricJob",
fields=[
(
"id",
@@ -28,133 +28,85 @@ class Migration(migrations.Migration):
),
(
"created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
+ models.DateTimeField(auto_now_add=True, verbose_name="Creation date"),
),
(
"updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
+ models.DateTimeField(auto_now=True, verbose_name="Last update date"),
+ ),
+ (
+ "access_date",
+ models.DateField(db_index=True, verbose_name="Access Date"),
),
(
"status",
models.CharField(
choices=[
- ("QUE", "Queued"),
- ("PAR", "Parsing"),
- ("PRO", "Processed"),
- ("INV", "Invalidated"),
+ ("PEN", "Pending"),
+ ("EXP", "Exporting"),
+ ("SUC", "Exported"),
+ ("ERR", "Error"),
],
- default="QUE",
- max_length=5,
+ db_index=True,
+ default="PEN",
+ max_length=3,
+ verbose_name="Status",
),
),
(
- "attachment",
- models.ForeignKey(
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="+",
- to="wagtaildocs.document",
- verbose_name="Attachment",
- ),
+ "input_log_hashes",
+ models.JSONField(default=list, verbose_name="Input Log Hashes"),
),
(
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
+ "storage_path",
+ models.CharField(
+ blank=True,
+ default="",
+ max_length=500,
+ verbose_name="Storage Path",
),
),
(
- "updated_by",
- models.ForeignKey(
+ "payload_hash",
+ models.CharField(
blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
+ default="",
+ max_length=64,
+ verbose_name="Payload Hash",
),
),
- ],
- options={
- "verbose_name": "Top 100 Articles File",
- "verbose_name_plural": "Top 100 Articles Files",
- },
- ),
- migrations.CreateModel(
- name="Top100Articles",
- fields=[
(
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
+ "summary",
+ models.JSONField(blank=True, default=dict, verbose_name="Summary"),
),
(
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
+ "attempts",
+ models.PositiveIntegerField(default=0, verbose_name="Attempts"),
),
(
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
+ "error_message",
+ models.TextField(blank=True, default="", verbose_name="Error Message"),
),
- ("pid_issn", models.CharField(max_length=9, verbose_name="PID ISSN")),
- ("year_month_day", models.DateField(verbose_name="Date of access")),
(
- "print_issn",
- models.CharField(
- blank=True, max_length=9, null=True, verbose_name="Print ISSN"
+ "export_started_at",
+ models.DateTimeField(
+ blank=True,
+ null=True,
+ verbose_name="Export Started At",
),
),
(
- "online_issn",
- models.CharField(
- blank=True, max_length=9, null=True, verbose_name="Online ISSN"
- ),
+ "exported_at",
+ models.DateTimeField(blank=True, null=True, verbose_name="Exported At"),
),
(
"collection",
- models.CharField(max_length=3, verbose_name="Collection Acronym 3"),
- ),
- ("pid", models.CharField(verbose_name="Publication ID")),
- (
- "yop",
- models.PositiveSmallIntegerField(
- verbose_name="Year of Publication"
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
),
),
- (
- "total_item_requests",
- models.IntegerField(verbose_name="Total Item Requests"),
- ),
- (
- "total_item_investigations",
- models.IntegerField(verbose_name="Total Item Investigations"),
- ),
- (
- "unique_item_requests",
- models.IntegerField(verbose_name="Unique Item Requests"),
- ),
- (
- "unique_item_investigations",
- models.IntegerField(verbose_name="Unique Item Investigations"),
- ),
(
"creator",
models.ForeignKey(
@@ -180,18 +132,23 @@ class Migration(migrations.Migration):
),
],
options={
- "verbose_name_plural": "Top 100 Articles",
- "indexes": [
- models.Index(
- fields=["pid_issn"], name="metrics_top_pid_iss_c1fba9_idx"
- ),
- models.Index(
- fields=["year_month_day"], name="metrics_top_year_mo_8cda7b_idx"
- ),
- ],
- "unique_together": {
- ("collection", "pid_issn", "pid", "year_month_day")
- },
+ "verbose_name": "Daily Metric Job",
+ "verbose_name_plural": "Daily Metric Jobs",
+ "unique_together": {("collection", "access_date")},
},
),
+ migrations.AddIndex(
+ model_name="dailymetricjob",
+ index=models.Index(
+ fields=["collection", "access_date"],
+ name="metrics_daily_coll_date_idx",
+ ),
+ ),
+ migrations.AddIndex(
+ model_name="dailymetricjob",
+ index=models.Index(
+ fields=["status", "export_started_at"],
+ name="metrics_daily_status_exp_idx",
+ ),
+ ),
]
diff --git a/metrics/migrations/0002_alter_top100articlesfile_status.py b/metrics/migrations/0002_alter_top100articlesfile_status.py
deleted file mode 100644
index b2b98c5..0000000
--- a/metrics/migrations/0002_alter_top100articlesfile_status.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Generated by Django 5.0.7 on 2024-08-30 21:27
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0001_initial"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="top100articlesfile",
- name="status",
- field=models.CharField(
- choices=[
- ("QUE", "Queued"),
- ("PAR", "Parsing"),
- ("PRO", "Processed"),
- ("ERR", "Error"),
- ("INV", "Invalidated"),
- ],
- default="QUE",
- max_length=5,
- ),
- ),
- ]
diff --git a/metrics/migrations/0003_remove_top100articlesfile_attachment_and_more.py b/metrics/migrations/0003_remove_top100articlesfile_attachment_and_more.py
deleted file mode 100644
index 8b01d80..0000000
--- a/metrics/migrations/0003_remove_top100articlesfile_attachment_and_more.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-07 16:55
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("article", "0001_initial"),
- ("collection", "0001_initial"),
- ("journal", "0001_initial"),
- ("metrics", "0002_alter_top100articlesfile_status"),
- ]
-
- operations = [
- migrations.RemoveField(
- model_name="top100articlesfile",
- name="attachment",
- ),
- migrations.RemoveField(
- model_name="top100articlesfile",
- name="creator",
- ),
- migrations.RemoveField(
- model_name="top100articlesfile",
- name="updated_by",
- ),
- migrations.CreateModel(
- name="Item",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "article",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="article.article",
- verbose_name="Article",
- ),
- ),
- (
- "collection",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="collection.collection",
- verbose_name="Collection",
- ),
- ),
- (
- "journal",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="journal.journal",
- verbose_name="Journal",
- ),
- ),
- ],
- options={
- "verbose_name": "Item",
- "verbose_name_plural": "Items",
- },
- ),
- migrations.CreateModel(
- name="UserAgent",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "name",
- models.CharField(
- db_index=True, max_length=255, verbose_name="Name"
- ),
- ),
- (
- "version",
- models.CharField(
- db_index=True, max_length=255, verbose_name="Version"
- ),
- ),
- ],
- options={
- "verbose_name": "User Agent",
- "verbose_name_plural": "User Agents",
- "unique_together": {("name", "version")},
- },
- ),
- migrations.CreateModel(
- name="UserSession",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- ("datetime", models.DateTimeField(verbose_name="Datetime")),
- (
- "user_ip",
- models.CharField(
- db_index=True, max_length=255, verbose_name="User IP"
- ),
- ),
- (
- "user_agent",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="metrics.useragent",
- verbose_name="User Agent",
- ),
- ),
- ],
- options={
- "verbose_name": "User Session",
- "verbose_name_plural": "User Sessions",
- },
- ),
- migrations.CreateModel(
- name="ItemAccess",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "country_code",
- models.CharField(
- db_index=True, max_length=2, verbose_name="Country"
- ),
- ),
- (
- "media_language",
- models.CharField(
- db_index=True, max_length=2, verbose_name="Media Language"
- ),
- ),
- (
- "media_format",
- models.CharField(max_length=10, verbose_name="Media Format"),
- ),
- (
- "item",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="metrics.item",
- verbose_name="Item",
- ),
- ),
- (
- "user_session",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="metrics.usersession",
- verbose_name="User Session",
- ),
- ),
- ],
- options={
- "verbose_name": "Item Access",
- "verbose_name_plural": "Items Access",
- },
- ),
- migrations.DeleteModel(
- name="Top100Articles",
- ),
- ]
diff --git a/metrics/migrations/0004_delete_top100articlesfile_and_more.py b/metrics/migrations/0004_delete_top100articlesfile_and_more.py
deleted file mode 100644
index b10c41b..0000000
--- a/metrics/migrations/0004_delete_top100articlesfile_and_more.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-07 16:55
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0003_remove_top100articlesfile_attachment_and_more"),
- ("tracker", "0003_logfilediscardedline_delete_top100articlesfileevent"),
- ]
-
- operations = [
- migrations.DeleteModel(
- name="Top100ArticlesFile",
- ),
- migrations.AddIndex(
- model_name="item",
- index=models.Index(
- fields=["collection", "journal", "article"],
- name="metrics_ite_collect_6971a5_idx",
- ),
- ),
- migrations.AddIndex(
- model_name="item",
- index=models.Index(
- fields=["collection", "journal"], name="metrics_ite_collect_b5f79b_idx"
- ),
- ),
- migrations.AlterUniqueTogether(
- name="item",
- unique_together={("collection", "journal", "article")},
- ),
- migrations.AlterUniqueTogether(
- name="usersession",
- unique_together={("datetime", "user_agent", "user_ip")},
- ),
- migrations.AlterUniqueTogether(
- name="itemaccess",
- unique_together={
- (
- "item",
- "user_session",
- "country_code",
- "media_format",
- "media_language",
- )
- },
- ),
- ]
diff --git a/metrics/migrations/0005_alter_itemaccess_unique_together_and_more.py b/metrics/migrations/0005_alter_itemaccess_unique_together_and_more.py
deleted file mode 100644
index 7bfafff..0000000
--- a/metrics/migrations/0005_alter_itemaccess_unique_together_and_more.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-27 20:40
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0004_delete_top100articlesfile_and_more"),
- ]
-
- operations = [
- migrations.AlterUniqueTogether(
- name="itemaccess",
- unique_together=set(),
- ),
- migrations.AddField(
- model_name="itemaccess",
- name="click_timestamps",
- field=models.JSONField(default=dict, verbose_name="Click Timestamps"),
- ),
- migrations.AddField(
- model_name="itemaccess",
- name="content_type",
- field=models.CharField(
- default="undefined", max_length=16, verbose_name="Content Type"
- ),
- preserve_default=False,
- ),
- migrations.AlterField(
- model_name="itemaccess",
- name="media_format",
- field=models.CharField(
- db_index=True, max_length=10, verbose_name="Media Format"
- ),
- ),
- migrations.AlterUniqueTogether(
- name="itemaccess",
- unique_together={
- (
- "item",
- "user_session",
- "country_code",
- "media_format",
- "media_language",
- "content_type",
- )
- },
- ),
- ]
diff --git a/metrics/migrations/0006_alter_itemaccess_content_type.py b/metrics/migrations/0006_alter_itemaccess_content_type.py
deleted file mode 100644
index 0e81287..0000000
--- a/metrics/migrations/0006_alter_itemaccess_content_type.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-31 21:07
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0005_alter_itemaccess_unique_together_and_more"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="itemaccess",
- name="content_type",
- field=models.CharField(max_length=32, verbose_name="Content Type"),
- ),
- ]
diff --git a/metrics/migrations/0007_alter_usersession_datetime_and_more.py b/metrics/migrations/0007_alter_usersession_datetime_and_more.py
deleted file mode 100644
index e45036e..0000000
--- a/metrics/migrations/0007_alter_usersession_datetime_and_more.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-12 17:16
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0006_alter_itemaccess_content_type"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="usersession",
- name="datetime",
- field=models.DateTimeField(db_index=True, verbose_name="Datetime"),
- ),
- migrations.AddIndex(
- model_name="itemaccess",
- index=models.Index(
- fields=["item", "user_session"], name="metrics_ite_item_id_8799c9_idx"
- ),
- ),
- ]
diff --git a/metrics/migrations/0008_remove_a_few_models.py b/metrics/migrations/0008_remove_a_few_models.py
deleted file mode 100644
index dfd14ec..0000000
--- a/metrics/migrations/0008_remove_a_few_models.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-22 17:45
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0007_alter_usersession_datetime_and_more"),
- ]
-
- operations = [
- migrations.AlterUniqueTogether(
- name="itemaccess",
- unique_together=None,
- ),
- migrations.AlterUniqueTogether(
- name="useragent",
- unique_together=None,
- ),
- migrations.AlterUniqueTogether(
- name="usersession",
- unique_together=None,
- ),
- migrations.RemoveField(
- model_name="itemaccess",
- name="user_session",
- ),
- migrations.RemoveField(
- model_name="usersession",
- name="user_agent",
- ),
- migrations.RemoveField(
- model_name="itemaccess",
- name="item",
- ),
- migrations.DeleteModel(
- name="Item",
- ),
- migrations.DeleteModel(
- name="ItemAccess",
- ),
- migrations.DeleteModel(
- name="UserAgent",
- ),
- migrations.DeleteModel(
- name="UserSession",
- ),
- ]
diff --git a/metrics/models.py b/metrics/models.py
index e69de29..aa789b5 100644
--- a/metrics/models.py
+++ b/metrics/models.py
@@ -0,0 +1,108 @@
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+
+from collection.models import Collection
+from core.models import CommonControlField
+
+
+class DailyMetricJob(CommonControlField):
+ """Bookkeeping row for one collection on one access date.
+
+ Stores a status code, the list of input log hashes, a storage path and
+ payload hash, a free-form summary, an attempt counter, an error message
+ and two export timestamps. ``Meta.unique_together`` allows at most one
+ row per (collection, access_date).
+ """
+ # Three-letter codes stored in ``status``; note that the "exported" state
+ # is persisted as "SUC".
+ STATUS_PENDING = "PEN"
+ STATUS_EXPORTING = "EXP"
+ STATUS_EXPORTED = "SUC"
+ STATUS_ERROR = "ERR"
+ STATUS_CHOICES = (
+ (STATUS_PENDING, _("Pending")),
+ (STATUS_EXPORTING, _("Exporting")),
+ (STATUS_EXPORTED, _("Exported")),
+ (STATUS_ERROR, _("Error")),
+ )
+
+ collection = models.ForeignKey(
+ Collection,
+ verbose_name=_("Collection"),
+ on_delete=models.CASCADE,
+ db_index=True,
+ )
+
+ access_date = models.DateField(
+ verbose_name=_("Access Date"),
+ db_index=True,
+ )
+
+ status = models.CharField(
+ verbose_name=_("Status"),
+ max_length=3,
+ choices=STATUS_CHOICES,
+ default=STATUS_PENDING,
+ db_index=True,
+ )
+
+ # JSON list; ``input_log_count`` below reports its length.
+ input_log_hashes = models.JSONField(
+ verbose_name=_("Input Log Hashes"),
+ default=list,
+ )
+
+ storage_path = models.CharField(
+ verbose_name=_("Storage Path"),
+ max_length=500,
+ blank=True,
+ default="",
+ )
+
+ # Empty until set; ``job_id`` returns "" while this is empty.
+ payload_hash = models.CharField(
+ verbose_name=_("Payload Hash"),
+ max_length=64,
+ blank=True,
+ default="",
+ )
+
+ summary = models.JSONField(
+ verbose_name=_("Summary"),
+ default=dict,
+ blank=True,
+ )
+
+ attempts = models.PositiveIntegerField(
+ verbose_name=_("Attempts"),
+ default=0,
+ )
+
+ error_message = models.TextField(
+ verbose_name=_("Error Message"),
+ blank=True,
+ default="",
+ )
+
+ export_started_at = models.DateTimeField(
+ verbose_name=_("Export Started At"),
+ null=True,
+ blank=True,
+ )
+
+ exported_at = models.DateTimeField(
+ verbose_name=_("Exported At"),
+ null=True,
+ blank=True,
+ )
+
+ @property
+ def input_log_count(self):
+ # Treats a null/empty ``input_log_hashes`` as zero entries.
+ return len(self.input_log_hashes or [])
+
+ @property
+ def job_id(self):
+ # Returns "" when no payload hash is recorded; otherwise
+ # "<collection acron3>|<ISO access date>|<payload hash>".
+ if not self.payload_hash:
+ return ""
+ return f"{self.collection.acron3}|{self.access_date.isoformat()}|{self.payload_hash}"
+
+ class Meta:
+ verbose_name = _("Daily Metric Job")
+ verbose_name_plural = _("Daily Metric Jobs")
+ unique_together = (("collection", "access_date"),)
+ # NOTE(review): the first index covers the same two columns as
+ # ``unique_together`` above -- confirm the explicit index is needed.
+ indexes = [
+ models.Index(fields=["collection", "access_date"], name="metrics_daily_coll_date_idx"),
+ models.Index(fields=["status", "export_started_at"], name="metrics_daily_status_exp_idx"),
+ ]
+
+ def __str__(self):
+ return f"{self.collection.acron3}-{self.access_date}"
diff --git a/metrics/templates/search/indexes/metrics/top100articles_text.txt b/metrics/templates/search/indexes/metrics/top100articles_text.txt
deleted file mode 100644
index ccf5e94..0000000
--- a/metrics/templates/search/indexes/metrics/top100articles_text.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-{{ object.collection }}
-{{ object.key_issn }}
-{{ object.pid }}
-{{ object.yop }}
-{{ object.language }}
-{{ object.country }}
-{{ object.total_item_requests }}
-{{ object.total_item_investigations }}
-{{ object.unique_item_requests }}
-{{ object.unique_item_investigations }}
\ No newline at end of file
diff --git a/metrics/wagtail_hooks.py b/metrics/wagtail_hooks.py
new file mode 100644
index 0000000..94c2ffb
--- /dev/null
+++ b/metrics/wagtail_hooks.py
@@ -0,0 +1,22 @@
+from django.utils.translation import gettext_lazy as _
+from wagtail.snippets.views.snippets import SnippetViewSet
+
+from metrics.models import DailyMetricJob
+
+class DailyMetricJobSnippetViewSet(SnippetViewSet):
+ """Wagtail snippet viewset describing how DailyMetricJob rows are listed.
+
+ NOTE(review): nothing in this file calls ``register_snippet`` for this
+ viewset -- confirm it is registered (directly or through a group) elsewhere.
+ """
+ model = DailyMetricJob
+ menu_label = _("Daily Metric Jobs")
+ icon = "history"
+ menu_order = 600
+ # ``input_log_count`` is a model property, not a database column.
+ # ``updated`` is presumably inherited from CommonControlField -- verify.
+ list_display = (
+ "collection",
+ "access_date",
+ "status",
+ "input_log_count",
+ "attempts",
+ "export_started_at",
+ "exported_at",
+ "updated",
+ )
+ list_filter = ("status", "collection", "access_date")
+ search_fields = ("collection__acron3", "error_message")
diff --git a/article/__init__.py b/reports/__init__.py
similarity index 100%
rename from article/__init__.py
rename to reports/__init__.py
diff --git a/reports/apps.py b/reports/apps.py
new file mode 100644
index 0000000..119ca26
--- /dev/null
+++ b/reports/apps.py
@@ -0,0 +1,8 @@
+from django.apps import AppConfig
+from django.utils.translation import gettext_lazy as _
+
+
+class ReportsConfig(AppConfig):
+ """Django application configuration for the ``reports`` app."""
+ default_auto_field = "django.db.models.BigAutoField"
+ name = "reports"
+ verbose_name = _("Reports")
diff --git a/reports/migrations/0001_initial.py b/reports/migrations/0001_initial.py
new file mode 100644
index 0000000..2a72923
--- /dev/null
+++ b/reports/migrations/0001_initial.py
@@ -0,0 +1,140 @@
+# Generated by Django 5.2.12 on 2026-05-01 15:50
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ initial = True
+
+ dependencies = [
+ ("collection", "0001_initial"),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name="MonthlyLogReport",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ ("total_files", models.IntegerField(default=0)),
+ ("created_files", models.IntegerField(default=0)),
+ ("validated_files", models.IntegerField(default=0)),
+ ("invalidated_files", models.IntegerField(default=0)),
+ ("errored_files", models.IntegerField(default=0)),
+ ("lines_parsed", models.IntegerField(default=0)),
+ ("valid_lines", models.IntegerField(default=0)),
+ ("discarded_lines", models.IntegerField(default=0)),
+ ("ip_local_count", models.IntegerField(default=0)),
+ ("ip_remote_count", models.IntegerField(default=0)),
+ ("ip_unknown_count", models.IntegerField(default=0)),
+ ("generated_at", models.DateTimeField(auto_now=True)),
+ ("year", models.IntegerField(verbose_name="Year")),
+ ("month", models.IntegerField(verbose_name="Month")),
+ (
+ "collection",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Monthly Log Report",
+ "verbose_name_plural": "Monthly Log Reports",
+ "ordering": ["-year", "-month", "collection__acron3"],
+ "unique_together": {("collection", "year", "month")},
+ },
+ ),
+ migrations.CreateModel(
+ name="WeeklyLogReport",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ ("total_files", models.IntegerField(default=0)),
+ ("created_files", models.IntegerField(default=0)),
+ ("validated_files", models.IntegerField(default=0)),
+ ("invalidated_files", models.IntegerField(default=0)),
+ ("errored_files", models.IntegerField(default=0)),
+ ("lines_parsed", models.IntegerField(default=0)),
+ ("valid_lines", models.IntegerField(default=0)),
+ ("discarded_lines", models.IntegerField(default=0)),
+ ("ip_local_count", models.IntegerField(default=0)),
+ ("ip_remote_count", models.IntegerField(default=0)),
+ ("ip_unknown_count", models.IntegerField(default=0)),
+ ("generated_at", models.DateTimeField(auto_now=True)),
+ ("year", models.IntegerField(verbose_name="Year")),
+ ("week", models.IntegerField(verbose_name="ISO Week")),
+ (
+ "collection",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Weekly Log Report",
+ "verbose_name_plural": "Weekly Log Reports",
+ "ordering": ["-year", "-week", "collection__acron3"],
+ "unique_together": {("collection", "year", "week")},
+ },
+ ),
+ migrations.CreateModel(
+ name="YearlyLogReport",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ ("total_files", models.IntegerField(default=0)),
+ ("created_files", models.IntegerField(default=0)),
+ ("validated_files", models.IntegerField(default=0)),
+ ("invalidated_files", models.IntegerField(default=0)),
+ ("errored_files", models.IntegerField(default=0)),
+ ("lines_parsed", models.IntegerField(default=0)),
+ ("valid_lines", models.IntegerField(default=0)),
+ ("discarded_lines", models.IntegerField(default=0)),
+ ("ip_local_count", models.IntegerField(default=0)),
+ ("ip_remote_count", models.IntegerField(default=0)),
+ ("ip_unknown_count", models.IntegerField(default=0)),
+ ("generated_at", models.DateTimeField(auto_now=True)),
+ ("year", models.IntegerField(verbose_name="Year")),
+ (
+ "collection",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Yearly Log Report",
+ "verbose_name_plural": "Yearly Log Reports",
+ "ordering": ["-year", "collection__acron3"],
+ "unique_together": {("collection", "year")},
+ },
+ ),
+ ]
diff --git a/reports/migrations/0002_alter_monthlylogreport_options_and_more.py b/reports/migrations/0002_alter_monthlylogreport_options_and_more.py
new file mode 100644
index 0000000..659215c
--- /dev/null
+++ b/reports/migrations/0002_alter_monthlylogreport_options_and_more.py
@@ -0,0 +1,36 @@
+# Generated by Django 5.2.12 on 2026-05-01 22:23
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("reports", "0001_initial"),
+ ]
+
+ operations = [
+ migrations.AlterModelOptions(
+ name="monthlylogreport",
+ options={
+ "ordering": ["collection__acron3", "year", "month"],
+ "verbose_name": "Monthly Log Report",
+ "verbose_name_plural": "Monthly Log Reports",
+ },
+ ),
+ migrations.AlterModelOptions(
+ name="weeklylogreport",
+ options={
+ "ordering": ["collection__acron3", "year", "week"],
+ "verbose_name": "Weekly Log Report",
+ "verbose_name_plural": "Weekly Log Reports",
+ },
+ ),
+ migrations.AlterModelOptions(
+ name="yearlylogreport",
+ options={
+ "ordering": ["collection__acron3", "year"],
+ "verbose_name": "Yearly Log Report",
+ "verbose_name_plural": "Yearly Log Reports",
+ },
+ ),
+ ]
diff --git a/article/management/__init__.py b/reports/migrations/__init__.py
similarity index 100%
rename from article/management/__init__.py
rename to reports/migrations/__init__.py
diff --git a/reports/models.py b/reports/models.py
new file mode 100644
index 0000000..3af1ec8
--- /dev/null
+++ b/reports/models.py
@@ -0,0 +1,100 @@
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+
+from collection.models import Collection
+
+
+class AbstractLogReport(models.Model):
+    """Abstract base with the counters shared by every per-period log report.
+
+    Concrete subclasses add their period columns and must override
+    ``period_label``; the base implementation raises ``NotImplementedError``.
+    The ``pct_*`` properties are derived from the stored counters and carry a
+    ``short_description`` for use as column headers.
+    """
+
+    collection = models.ForeignKey(
+        Collection,
+        verbose_name=_("Collection"),
+        on_delete=models.CASCADE,
+    )
+    total_files = models.IntegerField(default=0)
+    created_files = models.IntegerField(default=0)
+    validated_files = models.IntegerField(default=0)
+    invalidated_files = models.IntegerField(default=0)
+    errored_files = models.IntegerField(default=0)
+    lines_parsed = models.IntegerField(default=0)
+    valid_lines = models.IntegerField(default=0)
+    discarded_lines = models.IntegerField(default=0)
+    ip_local_count = models.IntegerField(default=0)
+    ip_remote_count = models.IntegerField(default=0)
+    ip_unknown_count = models.IntegerField(default=0)
+    generated_at = models.DateTimeField(auto_now=True)
+
+    class Meta:
+        abstract = True
+
+    @staticmethod
+    def _percentage(part, whole):
+        """Return ``part`` as a percentage of ``whole``, rounded to one decimal.
+
+        Returns the integer ``0`` when ``whole`` is falsy, which avoids a
+        division by zero.
+        """
+        if whole:
+            return round(part / whole * 100, 1)
+        return 0
+
+    @property
+    def pct_validated(self):
+        """Percentage of ``total_files`` counted as ``validated_files``."""
+        return self._percentage(self.validated_files, self.total_files)
+    pct_validated.fget.short_description = _("% Valid Files")
+
+    @property
+    def pct_valid_lines(self):
+        """Percentage of ``lines_parsed`` counted as ``valid_lines``."""
+        return self._percentage(self.valid_lines, self.lines_parsed)
+    pct_valid_lines.fget.short_description = _("% Valid Lines")
+
+    @property
+    def pct_remote_ip(self):
+        """Percentage of remote IPs among remote plus local IPs.
+
+        ``ip_unknown_count`` is not part of the denominator.
+        """
+        known_ips = self.ip_remote_count + self.ip_local_count
+        return self._percentage(self.ip_remote_count, known_ips)
+    pct_remote_ip.fget.short_description = _("% Remote IP")
+
+    @property
+    def period_label(self):
+        """Human-readable period; concrete subclasses must override this."""
+        raise NotImplementedError
+
+    def __str__(self):
+        return f"{self.collection.acron3} {self.period_label}"
+
+
+class WeeklyLogReport(AbstractLogReport):
+ """Log report counters for one collection and one (year, ISO week) pair."""
+ year = models.IntegerField(verbose_name=_("Year"))
+ week = models.IntegerField(verbose_name=_("ISO Week"))
+
+ class Meta:
+ # One row per collection and week; listed by collection, then oldest first.
+ unique_together = [("collection", "year", "week")]
+ ordering = ["collection__acron3", "year", "week"]
+ verbose_name = _("Weekly Log Report")
+ verbose_name_plural = _("Weekly Log Reports")
+
+ @property
+ def period_label(self):
+ # Week is zero-padded to two digits, e.g. "2025-W03".
+ return f"{self.year}-W{self.week:02d}"
+
+
+class MonthlyLogReport(AbstractLogReport):
+ """Log report counters for one collection and one (year, month) pair."""
+ year = models.IntegerField(verbose_name=_("Year"))
+ month = models.IntegerField(verbose_name=_("Month"))
+
+ class Meta:
+ # One row per collection and month; listed by collection, then oldest first.
+ unique_together = [("collection", "year", "month")]
+ ordering = ["collection__acron3", "year", "month"]
+ verbose_name = _("Monthly Log Report")
+ verbose_name_plural = _("Monthly Log Reports")
+
+ @property
+ def period_label(self):
+ # Month is zero-padded to two digits, e.g. "2025-03".
+ return f"{self.year}-{self.month:02d}"
+
+
+class YearlyLogReport(AbstractLogReport):
+ """Log report counters for one collection and one year."""
+ year = models.IntegerField(verbose_name=_("Year"))
+
+ class Meta:
+ # One row per collection and year; listed by collection, then oldest first.
+ unique_together = [("collection", "year")]
+ ordering = ["collection__acron3", "year"]
+ verbose_name = _("Yearly Log Report")
+ verbose_name_plural = _("Yearly Log Reports")
+
+ @property
+ def period_label(self):
+ # The label is simply the year rendered as text.
+ return str(self.year)
diff --git a/reports/tasks.py b/reports/tasks.py
new file mode 100644
index 0000000..69a53a1
--- /dev/null
+++ b/reports/tasks.py
@@ -0,0 +1,238 @@
+import logging
+from collections import defaultdict
+
+from django.core.mail import send_mail
+from django.conf import settings
+from django.utils.translation import gettext as _
+
+from config import celery_app
+from core.utils import date_utils
+from collection.models import Collection
+from log_manager import choices
+from log_manager.models import LogFile
+from log_manager_config import models as lmc_models
+
+from reports.models import WeeklyLogReport, MonthlyLogReport, YearlyLogReport
+
+
+def _extract_date_from_log_file(lf):
+    """Return the best available date for a log file, or ``None``.
+
+    Sources are tried in order:
+
+    1. ``lf.date`` when it is set;
+    2. ``lf.validation["probably_date"]`` when it is a non-empty string;
+    3. the first ``YYYY-MM-DD`` substring found in ``lf.path``.
+
+    The path lookup is a best-effort fallback: a missing or non-string path,
+    or a matched string that cannot be converted, yields ``None`` instead of
+    raising. Conversion failures are logged at debug level so they are no
+    longer invisible.
+    """
+    # ``re`` is not imported at module level, so keep the import local.
+    import re
+
+    if lf.date:
+        return lf.date
+
+    probably_date = (lf.validation or {}).get("probably_date")
+    if isinstance(probably_date, str) and probably_date:
+        return date_utils.get_date_obj(probably_date)
+
+    path = lf.path
+    if not isinstance(path, str):
+        return None
+
+    match = re.search(r"(\d{4}-\d{2}-\d{2})", path)
+    if match is None:
+        return None
+
+    try:
+        return date_utils.get_date_obj(match.group(1))
+    except Exception:
+        # The digits matched the pattern but may not form a real date
+        # (e.g. "2024-99-99"); the file is then treated as undated.
+        logging.debug(
+            "Could not derive a date from log file path %r.", path, exc_info=True
+        )
+        return None
+
+
+def _log_file_counters(lf):
+    """Return what one log file contributes to every report counter.
+
+    Every counter is always present in the result (zero when it does not
+    apply), so the aggregated rows overwrite all counter columns when they
+    are written with ``update_or_create``.
+    """
+    summary = lf.summary or {}
+    lines_parsed = summary.get("lines_parsed", 0) or 0
+    valid_lines = summary.get("valid_lines", 0) or 0
+
+    # Any level of the validation payload may be missing or null.
+    content = (lf.validation or {}).get("content") or {}
+    ips = (content.get("summary") or {}).get("ips") or {}
+
+    status = lf.status
+    return {
+        "total_files": 1,
+        "created_files": int(status == "CRE"),
+        "validated_files": int(status in ("QUE", "PAR", "PRO")),
+        "invalidated_files": int(status == "INV"),
+        "errored_files": int(status == "ERR"),
+        "lines_parsed": lines_parsed,
+        "valid_lines": valid_lines,
+        "discarded_lines": max(lines_parsed - valid_lines, 0),
+        "ip_local_count": ips.get("local", 0) or 0,
+        "ip_remote_count": ips.get("remote", 0) or 0,
+        "ip_unknown_count": ips.get("unknown", 0) or 0,
+    }
+
+
+@celery_app.task(bind=True, name=_("[Reports] Populate All Reports"))
+def task_populate_all_reports(self, year=None, collection_acron=None):
+    """Rebuild the weekly, monthly and yearly log reports from ``LogFile`` rows.
+
+    Args:
+        year: Optional year (int or numeric string) restricting which periods
+            are rebuilt. Monthly and yearly rows are selected by calendar
+            year; weekly rows by ISO year, because that is how they are keyed.
+            Filtering weekly rows by calendar year would rewrite weeks that
+            straddle New Year from only part of their days.
+        collection_acron: Optional ``acron3`` restricting the collections.
+
+    Returns:
+        A summary string with the number of rows written per report type.
+
+    Log files for which no date can be determined are skipped.
+    """
+    target_year = int(year) if year else None
+
+    # Only ``collection_id`` is read, so the related Collection row is not
+    # loaded.
+    qs = LogFile.objects.all()
+    if collection_acron:
+        qs = qs.filter(collection__acron3=collection_acron)
+    qs = qs.only(
+        "id", "collection_id", "date", "path", "status", "summary", "validation"
+    )
+
+    weekly = defaultdict(lambda: defaultdict(int))
+    monthly = defaultdict(lambda: defaultdict(int))
+    yearly = defaultdict(lambda: defaultdict(int))
+
+    for lf in qs.iterator(chunk_size=2000):
+        extracted_date = _extract_date_from_log_file(lf)
+        if not extracted_date:
+            continue
+
+        # Named ``_weekday`` rather than ``_`` so gettext is not shadowed.
+        iso_year, iso_week, _weekday = extracted_date.isocalendar()
+        cal_year = extracted_date.year
+
+        rows = []
+        if target_year is None or iso_year == target_year:
+            rows.append(weekly[(lf.collection_id, iso_year, iso_week)])
+        if target_year is None or cal_year == target_year:
+            rows.append(monthly[(lf.collection_id, cal_year, extracted_date.month)])
+            rows.append(yearly[(lf.collection_id, cal_year)])
+        if not rows:
+            continue
+
+        # Computed once per file and added to every matching period.
+        counters = _log_file_counters(lf)
+        for row in rows:
+            for counter_name, amount in counters.items():
+                row[counter_name] += amount
+
+    w_count = _upsert_reports(WeeklyLogReport, weekly)
+    m_count = _upsert_reports(MonthlyLogReport, monthly)
+    y_count = _upsert_reports(YearlyLogReport, yearly)
+
+    logging.info(
+        "Reports populated: %s weekly, %s monthly, %s yearly.",
+        w_count, m_count, y_count,
+    )
+    return f"Weekly: {w_count}, Monthly: {m_count}, Yearly: {y_count}"
+
+
+def _upsert_reports(model_class, data):
+    """Create or update one ``model_class`` row per entry of ``data``.
+
+    Each key of ``data`` is a tuple laid out like the model's first
+    ``unique_together`` entry: the collection id followed by the period
+    values. Each value is the mapping of counters passed as ``defaults``.
+
+    Returns the number of entries processed.
+    """
+    period_fields = tuple(model_class._meta.unique_together[0])[1:]
+    written = 0
+    for key, counters in data.items():
+        lookup = {
+            "collection_id": key[0],
+            # Position 0 of the key is the collection id, so periods start at 1.
+            **{
+                field_name: key[position]
+                for position, field_name in enumerate(period_fields, start=1)
+            },
+        }
+        model_class.objects.update_or_create(defaults=counters, **lookup)
+        written += 1
+    return written
+
+
+@celery_app.task(
+    bind=True,
+    name=_("[Reports] Generate Log Report Summary (Manual)"),
+    queue="load",
+)
+def task_log_files_count_status_report(
+    self,
+    collections=None,
+    from_date=None,
+    until_date=None,
+    days_to_go_back=None,
+    user_id=None,
+    username=None,
+):
+    """Email a usage-log report summary to each collection's recipients.
+
+    Args:
+        collections: Iterable of collection ``acron3`` values; when empty or
+            ``None`` the result of ``Collection.acron3_list()`` is used.
+        from_date, until_date, days_to_go_back: Forwarded to
+            ``date_utils.get_date_range_str`` to build the period shown in
+            the subject and the message.
+        user_id, username: Accepted but not used by this task.
+
+    Unknown collections are logged and skipped, as are collections for which
+    no message could be built.
+    """
+    from_date_str, until_date_str = date_utils.get_date_range_str(
+        from_date, until_date, days_to_go_back
+    )
+    # Translate the constant template first, then interpolate. Wrapping an
+    # f-string in ``_()`` would look up a msgid that already contains the
+    # dates, which can never match a catalog entry.
+    subject = _(
+        "Usage Log Report Summary ({from_date} to {until_date})"
+    ).format(from_date=from_date_str, until_date=until_date_str)
+
+    for collection_acron in (collections or Collection.acron3_list()):
+        try:
+            collection = Collection.objects.get(acron3=collection_acron)
+        except Collection.DoesNotExist:
+            logging.warning("Collection not found: %s", collection_acron)
+            continue
+
+        message = _build_report_message(
+            collection,
+            from_date_str,
+            until_date_str,
+        )
+
+        if not message:
+            continue
+
+        logging.info(
+            "Sending email to collection %s. Subject: %s.",
+            collection.main_name, subject,
+        )
+
+        _send_collection_email(subject, message, collection_acron)
+
+
+def _build_report_message(collection, from_date_str, until_date_str):
+    """Build the plain-text report body for one collection.
+
+    The body describes the most recent ``MonthlyLogReport`` row and, when an
+    earlier row exists, that previous month plus the month-over-month change
+    in files and lines. The change section is only produced when a distinct
+    previous month exists and it has files.
+
+    ``from_date_str`` and ``until_date_str`` only appear in the header; they
+    do not select which months are reported.
+
+    Returns an empty string when the collection has no monthly report.
+    """
+    # A single query for the two newest rows, instead of separate
+    # exists()/first()/len()/index evaluations.
+    recent = list(
+        MonthlyLogReport.objects.filter(collection=collection)
+        .order_by("-year", "-month")[:2]
+    )
+    if not recent:
+        return ""
+
+    latest = recent[0]
+    previous = recent[1] if len(recent) > 1 else None
+
+    # Translate the constant template, then interpolate. An f-string inside
+    # ``_()`` cannot be extracted or matched by gettext.
+    message = _(
+        "Usage Log Report for {collection}\n"
+        "Period: {from_date} to {until_date}\n\n"
+    ).format(
+        collection=collection.acron3,
+        from_date=from_date_str,
+        until_date=until_date_str,
+    )
+    message += _("Latest month ({latest}):\n").format(latest=latest.period_label)
+    message += (
+        f" Total files: {latest.total_files}\n"
+        f" Validated files: {latest.validated_files} ({latest.pct_validated}%)\n"
+        f" Invalidated files: {latest.invalidated_files}\n"
+        f" Errored files: {latest.errored_files}\n"
+        f" Lines parsed: {latest.lines_parsed}\n"
+        f" Valid lines: {latest.valid_lines} ({latest.pct_valid_lines}%)\n"
+        f" Discarded lines: {latest.discarded_lines}\n"
+        f" Remote IPs: {latest.ip_remote_count} ({latest.pct_remote_ip}%)\n"
+        f" Local IPs: {latest.ip_local_count}\n"
+    )
+
+    if previous is not None:
+        message += _("\nPrevious month ({prev}):\n").format(prev=previous.period_label)
+        message += (
+            f" Total files: {previous.total_files}\n"
+            f" Validated files: {previous.validated_files} ({previous.pct_validated}%)\n"
+            f" Valid lines: {previous.valid_lines} ({previous.pct_valid_lines}%)\n"
+            f" Remote IPs: {previous.ip_remote_count} ({previous.pct_remote_ip}%)\n"
+        )
+
+        # Without a distinct previous month there is nothing to compare, so
+        # no "+0" change section is produced for a lone month.
+        if previous.total_files:
+            file_diff = latest.total_files - previous.total_files
+            line_diff = latest.lines_parsed - previous.lines_parsed
+            message += _("\nMonth-over-month change:\n")
+            message += f" Files: {file_diff:+d}\n"
+            message += f" Lines: {line_diff:+d}\n"
+
+    message += (
+        "\n---\n"
+        "This report is automatically generated by SciELO Usage.\n"
+    )
+    return message
+
+
+def _send_collection_email(subject, message, collection):
+    """Send ``message`` to every active email configured for a collection.
+
+    ``collection`` is matched against ``config__collection__acron3``. When no
+    active address exists an error is logged and nothing is sent. A failure
+    while sending is logged and not re-raised.
+    """
+    recipients = list(
+        lmc_models.CollectionEmail.objects.filter(
+            config__collection__acron3=collection, active=True
+        ).values_list("email", flat=True)
+    )
+
+    if recipients:
+        try:
+            send_mail(
+                subject=subject,
+                message=message,
+                from_email=settings.DEFAULT_FROM_EMAIL,
+                recipient_list=recipients,
+            )
+        except Exception as exc:
+            logging.error("Error sending log files report for %s: %s", collection, exc)
+    else:
+        logging.error(
+            "Error. Please, add an E-mail Configuration for the collection %s.",
+            collection,
+        )
diff --git a/reports/wagtail_hooks.py b/reports/wagtail_hooks.py
new file mode 100644
index 0000000..b2aeac7
--- /dev/null
+++ b/reports/wagtail_hooks.py
@@ -0,0 +1,75 @@
+from django.contrib.auth import get_user_model
+from django.utils.translation import gettext_lazy as _
+from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup
+from wagtail.snippets.models import register_snippet
+from wagtail.permission_policies.base import BasePermissionPolicy
+
+from reports.models import WeeklyLogReport, MonthlyLogReport, YearlyLogReport
+
+
+class ReadOnlyPermissionPolicy(BasePermissionPolicy):
+    """Permission policy that refuses every mutating action.
+
+    ``add``, ``change`` and ``delete`` are denied for all users; any other
+    action is allowed. ``users_with_any_permission`` returns all active users
+    regardless of the actions asked about.
+    """
+
+    def user_has_permission(self, user, action):
+        return action not in ("add", "change", "delete")
+
+    def users_with_any_permission(self, actions):
+        user_model = get_user_model()
+        return user_model.objects.filter(is_active=True)
+
+
+# Columns appended to ``list_display`` by each report viewset in this module;
+# the ``pct_*`` entries are properties on the report models, not DB columns.
+COMMON_LIST_DISPLAY = (
+ "total_files",
+ "pct_validated",
+ "lines_parsed",
+ "pct_valid_lines",
+ "pct_remote_ip",
+ "generated_at",
+)
+
+
+class WeeklyLogReportSnippetViewSet(SnippetViewSet):
+ """Snippet viewset for WeeklyLogReport, using ReadOnlyPermissionPolicy."""
+ model = WeeklyLogReport
+ menu_label = _("Weekly")
+ icon = "info-circle"
+ menu_order = 100
+ # Period columns first, then the shared counter/percentage columns.
+ list_display = ("collection", "year", "week") + COMMON_LIST_DISPLAY
+ list_filter = ("collection", "year", "week")
+ search_fields = ("collection__acron3",)
+ permission_policy = ReadOnlyPermissionPolicy(WeeklyLogReport)
+
+
+class MonthlyLogReportSnippetViewSet(SnippetViewSet):
+ """Snippet viewset for MonthlyLogReport, using ReadOnlyPermissionPolicy."""
+ model = MonthlyLogReport
+ menu_label = _("Monthly")
+ icon = "info-circle"
+ menu_order = 200
+ # Period columns first, then the shared counter/percentage columns.
+ list_display = ("collection", "year", "month") + COMMON_LIST_DISPLAY
+ list_filter = ("collection", "year", "month")
+ search_fields = ("collection__acron3",)
+ permission_policy = ReadOnlyPermissionPolicy(MonthlyLogReport)
+
+
+class YearlyLogReportSnippetViewSet(SnippetViewSet):
+ """Snippet viewset for YearlyLogReport, using ReadOnlyPermissionPolicy."""
+ model = YearlyLogReport
+ menu_label = _("Yearly")
+ icon = "info-circle"
+ menu_order = 300
+ # Period column first, then the shared counter/percentage columns.
+ list_display = ("collection", "year") + COMMON_LIST_DISPLAY
+ list_filter = ("collection", "year")
+ search_fields = ("collection__acron3",)
+ permission_policy = ReadOnlyPermissionPolicy(YearlyLogReport)
+
+
+class ReportsSnippetViewSetGroup(SnippetViewSetGroup):
+ """Groups the weekly, monthly and yearly report viewsets under one "Reports" menu."""
+ menu_name = "usage_reports"
+ menu_label = _("Reports")
+ menu_icon = "info-circle"
+ menu_order = 350
+ items = (
+ WeeklyLogReportSnippetViewSet,
+ MonthlyLogReportSnippetViewSet,
+ YearlyLogReportSnippetViewSet,
+ )
+
+
+register_snippet(ReportsSnippetViewSetGroup)
diff --git a/resources/constants.py b/resources/constants.py
index feba18d..2ce64da 100644
--- a/resources/constants.py
+++ b/resources/constants.py
@@ -1,2 +1,2 @@
DEFAULT_COUNTER_ROBOTS_URL = 'https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json'
-DEFAULT_MMDB_URL = 'https://download.db-ip.com/free/dbip-country-lite-2025-02.mmdb.gz'
+DEFAULT_MMDB_URL = 'https://download.db-ip.com/free/dbip-country-lite-2026-03.mmdb.gz'
diff --git a/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py b/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py
new file mode 100644
index 0000000..80bb0cc
--- /dev/null
+++ b/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py
@@ -0,0 +1,61 @@
+# Generated by Django 5.2.12 on 2026-05-01 22:23
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("resources", "0001_initial"),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name="mmdb",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="mmdb",
+ name="updated_by",
+ ),
+ migrations.RemoveField(
+ model_name="robotuseragent",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="robotuseragent",
+ name="updated_by",
+ ),
+ migrations.AddField(
+ model_name="robotuseragent",
+ name="is_active",
+ field=models.BooleanField(
+ db_index=True, default=True, verbose_name="Active"
+ ),
+ ),
+ migrations.AddField(
+ model_name="robotuseragent",
+ name="source_counter",
+ field=models.BooleanField(
+ db_index=True, default=False, verbose_name="From Atmire/COUNTER"
+ ),
+ ),
+ migrations.AddField(
+ model_name="robotuseragent",
+ name="source_scielo",
+ field=models.BooleanField(
+ db_index=True, default=False, verbose_name="From SciELO"
+ ),
+ ),
+ migrations.AddField(
+ model_name="robotuseragent",
+ name="source_url",
+ field=models.URLField(
+ blank=True, max_length=255, null=True, verbose_name="Source URL"
+ ),
+ ),
+ migrations.AlterField(
+ model_name="robotuseragent",
+ name="last_changed",
+ field=models.DateField(blank=True, null=True, verbose_name="Last Changed"),
+ ),
+ ]
diff --git a/resources/models.py b/resources/models.py
index a30b8d3..22663e2 100644
--- a/resources/models.py
+++ b/resources/models.py
@@ -2,11 +2,26 @@
from django.db import models
from django.utils.translation import gettext_lazy as _
+from wagtail.admin.panels import FieldPanel
-from core.models import CommonControlField
+class RobotUserAgent(models.Model):
+ SOURCE_ALL = "all"
+ SOURCE_COUNTER = "counter"
+ SOURCE_SCIELO = "scielo"
+ SOURCE_CHOICES = [SOURCE_ALL, SOURCE_COUNTER, SOURCE_SCIELO]
+ panels = [
+ FieldPanel("pattern"),
+ FieldPanel("source_counter"),
+ FieldPanel("source_scielo"),
+ FieldPanel("is_active"),
+ FieldPanel("source_url"),
+ FieldPanel("last_changed"),
+ ]
+
+ created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
+ updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True)
-class RobotUserAgent(CommonControlField):
pattern = models.CharField(
verbose_name=_('Pattern'),
max_length=255,
@@ -14,21 +29,77 @@ class RobotUserAgent(CommonControlField):
blank=False,
primary_key=True,
)
+ source_counter = models.BooleanField(
+ verbose_name=_("From Atmire/COUNTER"),
+ default=False,
+ db_index=True,
+ )
+ source_scielo = models.BooleanField(
+ verbose_name=_("From SciELO"),
+ default=False,
+ db_index=True,
+ )
+ is_active = models.BooleanField(
+ verbose_name=_("Active"),
+ default=True,
+ db_index=True,
+ )
+ source_url = models.URLField(
+ verbose_name=_("Source URL"),
+ max_length=255,
+ null=True,
+ blank=True,
+ )
last_changed = models.DateField(
verbose_name=_('Last Changed'),
- null=False,
- blank=False,
+ null=True,
+ blank=True,
)
@classmethod
def get_all_patterns(cls):
- return cls.objects.values_list('pattern', flat=True)
+ return cls.get_patterns(source=cls.SOURCE_ALL)
+
+ @classmethod
+ def normalize_source(cls, source=None):
+ normalized = (source or cls.SOURCE_ALL).lower()
+ if normalized not in cls.SOURCE_CHOICES:
+ raise ValueError(f"Unsupported robots source: {source}")
+ return normalized
+
+ @classmethod
+ def get_patterns(cls, source=None):
+ source = cls.normalize_source(source)
+ queryset = cls.objects.filter(is_active=True)
+
+ if source == cls.SOURCE_COUNTER:
+ queryset = queryset.filter(source_counter=True)
+ elif source == cls.SOURCE_SCIELO:
+ queryset = queryset.filter(source_scielo=True)
+
+ return queryset.values_list("pattern", flat=True)
+
+ @property
+ def source_labels(self):
+ labels = []
+ if self.source_counter:
+ labels.append("Atmire/COUNTER")
+ if self.source_scielo:
+ labels.append("SciELO")
+ return ", ".join(labels) or "-"
+
+ def save(self, *args, **kwargs):
+ if not self.source_counter and not self.source_scielo:
+ self.source_scielo = True
+ super().save(*args, **kwargs)
def __str__(self):
return self.pattern
-class MMDB(CommonControlField):
+class MMDB(models.Model):
+ created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
+ updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True)
id = models.CharField(
verbose_name=_('ID (HASH)'),
max_length=64,
diff --git a/resources/tasks.py b/resources/tasks.py
index e67cea1..4df60a9 100644
--- a/resources/tasks.py
+++ b/resources/tasks.py
@@ -1,19 +1,13 @@
import logging
-from django.contrib.auth import get_user_model
-from django.utils import timezone
from django.utils.translation import gettext as _
from config import celery_app
-from core.utils.utils import _get_user
from . import constants, models, utils
-
-User = get_user_model()
-
-@celery_app.task(bind=True, name=_('Load robots data'))
-def task_load_robots(self, url_robots=None, user_id=None, username=None):
+@celery_app.task(bind=True, name=_('[Resources] Load Robots Data'))
+def task_load_robots(self, url_robots=None):
"""
Load robots from a given URL and save them to the database.
This function fetches robot data from a specified URL (or a default URL if none is provided),
@@ -32,8 +26,6 @@ def task_load_robots(self, url_robots=None, user_id=None, username=None):
- Error if there is an issue downloading or saving the robots.
- Debug information for each robot saved.
"""
- user = _get_user(self.request, username=username, user_id=user_id)
-
if not url_robots:
url_robots = constants.DEFAULT_COUNTER_ROBOTS_URL
logging.warning(f'No robots URL provided. Using default: {url_robots}')
@@ -45,43 +37,63 @@ def task_load_robots(self, url_robots=None, user_id=None, username=None):
return False
cleaned_robots_data = utils.clean_robots_list(robots_data)
+ fetched_patterns = set()
try:
for r_str in cleaned_robots_data:
pattern = r_str.get('pattern')
last_changed = r_str.get('last_changed')
+ fetched_patterns.add(pattern)
- r_obj, created = models.RobotUserAgent.objects.get_or_create(pattern=pattern, last_changed=last_changed)
+ r_obj = models.RobotUserAgent.objects.filter(pattern=pattern).first()
+ created = r_obj is None
if created:
- r_obj.creator = user
-
- r_obj.updated = timezone.now()
- r_obj.updated_by = user
+ r_obj = models.RobotUserAgent(
+ pattern=pattern,
+ source_counter=True,
+ source_scielo=False,
+ )
+ r_obj.source_counter = True
+ r_obj.is_active = True
+ r_obj.source_url = url_robots
+ r_obj.last_changed = last_changed
r_obj.save()
logging.debug(f'Robot saved: {r_obj}')
+
+ stale_counter_patterns = models.RobotUserAgent.objects.filter(
+ source_counter=True
+ ).exclude(pattern__in=fetched_patterns)
+
+ for r_obj in stale_counter_patterns:
+ r_obj.source_counter = False
+ r_obj.source_url = None
+ r_obj.last_changed = None
+ if not r_obj.source_scielo:
+ r_obj.is_active = False
+ r_obj.save()
+ logging.debug(f'Robot deactivated or detached from COUNTER source: {r_obj}')
+
return True
except Exception as e:
logging.error(f'Error saving robots: {e}')
+ return False
-@celery_app.task(bind=True, name=_('Load geolocation and country data'))
-def task_load_geoip(self, url_geoip=None, user_id=None, username=None, validate=True):
+@celery_app.task(bind=True, name=_('[Resources] Load Geolocation Data'))
+def task_load_geoip(self, url_geoip=None, validate=True):
"""
Load GeoIP data from a specified URL, validate it, and save it to the database.
Args:
url_geoip (str, optional): The URL to download the GeoIP data from. Defaults to None.
- user_id (int, optional): The ID of the user performing the task. Defaults to None.
- username (str, optional): The username of the user performing the task. Defaults to None.
validate (bool, optional): Whether to validate the GeoIP data. Defaults to True.
Returns:
bool: True if the GeoIP data was successfully loaded and saved, False otherwise.
Raises:
Exception: If there is an error downloading, decompressing, or validating the GeoIP data.
"""
- user = _get_user(self.request, username=username, user_id=user_id)
if not url_geoip:
url_geoip = constants.DEFAULT_MMDB_URL
@@ -115,10 +127,6 @@ def task_load_geoip(self, url_geoip=None, user_id=None, username=None, validate=
except models.MMDB.DoesNotExist:
mmdb_obj = models.MMDB.objects.create(id=mmdb_hash, data=mmdb_data)
mmdb_obj.url = url_geoip or constants.DEFAULT_MMDB_URL
- mmdb_obj.creator = user
-
- mmdb_obj.updated = timezone.now()
- mmdb_obj.updated_by = user
mmdb_obj.save()
logging.debug(f'GeoIP data has been saved: {mmdb_obj}')
diff --git a/resources/tests.py b/resources/tests.py
deleted file mode 100644
index 7ce503c..0000000
--- a/resources/tests.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
diff --git a/resources/wagtail_hooks.py b/resources/wagtail_hooks.py
index 758bb53..c347b22 100644
--- a/resources/wagtail_hooks.py
+++ b/resources/wagtail_hooks.py
@@ -15,13 +15,25 @@ class RobotUserAgentSnippetViewSet(SnippetViewSet):
list_display = (
"pattern",
+ "source_labels",
+ "is_active",
"last_changed",
)
search_fields = (
"pattern",
+ "source_url",
+ )
+ list_filter = (
+ "source_counter",
+ "source_scielo",
+ "is_active",
)
list_export = (
"pattern",
+ "source_counter",
+ "source_scielo",
+ "is_active",
+ "source_url",
"last_changed",
)
export_filename = "robots"
diff --git a/source/__init__.py b/source/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/source/__init__.py
@@ -0,0 +1 @@
+
diff --git a/article/apps.py b/source/apps.py
similarity index 63%
rename from article/apps.py
rename to source/apps.py
index 8c0e2c9..06d886d 100644
--- a/article/apps.py
+++ b/source/apps.py
@@ -1,6 +1,6 @@
from django.apps import AppConfig
-class ArticleConfig(AppConfig):
+class SourceConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
- name = "article"
+ name = "source"
diff --git a/source/migrations/0001_initial.py b/source/migrations/0001_initial.py
new file mode 100644
index 0000000..cc736e3
--- /dev/null
+++ b/source/migrations/0001_initial.py
@@ -0,0 +1,210 @@
+# Generated by Django 5.0.7 on 2026-03-15 00:00
+
+import django.db.models.deletion
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ initial = True
+
+ dependencies = [
+ ("collection", "0001_initial"),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name="Source",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "created",
+ models.DateTimeField(
+ auto_now_add=True,
+ verbose_name="Creation date",
+ ),
+ ),
+ (
+ "updated",
+ models.DateTimeField(
+ auto_now=True,
+ verbose_name="Last update date",
+ ),
+ ),
+ (
+ "source_type",
+ models.CharField(
+ choices=[
+ ("journal", "Journal"),
+ ("book", "Book"),
+ ("preprint_server", "Preprint Server"),
+ ("data_repository", "Data Repository"),
+ ("other", "Other"),
+ ],
+ db_index=True,
+ max_length=32,
+ verbose_name="Source Type",
+ ),
+ ),
+ (
+ "source_id",
+ models.CharField(
+ db_index=True,
+ max_length=255,
+ verbose_name="Source ID",
+ ),
+ ),
+ (
+ "scielo_issn",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=9,
+ null=True,
+ verbose_name="SciELO ISSN",
+ ),
+ ),
+ (
+ "acronym",
+ models.CharField(
+ blank=True,
+ default="",
+ max_length=64,
+ null=True,
+ verbose_name="Source Acronym",
+ ),
+ ),
+ (
+ "title",
+ models.CharField(
+ max_length=255,
+ verbose_name="Source Title",
+ ),
+ ),
+ (
+ "identifiers",
+ models.JSONField(
+ blank=True,
+ default=dict,
+ null=True,
+ verbose_name="Identifiers",
+ ),
+ ),
+ (
+ "publisher_name",
+ models.JSONField(
+ blank=True,
+ default=list,
+ null=True,
+ verbose_name="Publisher Name",
+ ),
+ ),
+ (
+ "subject_areas",
+ models.JSONField(
+ default=list,
+ verbose_name="Subject Areas (CAPES)",
+ ),
+ ),
+ (
+ "wos_subject_areas",
+ models.JSONField(
+ default=list,
+ verbose_name="Subject Areas (WoS)",
+ ),
+ ),
+ (
+ "default_lang",
+ models.CharField(
+ blank=True,
+ max_length=8,
+ null=True,
+ verbose_name="Default Language",
+ ),
+ ),
+ (
+ "publication_date",
+ models.CharField(
+ blank=True,
+ max_length=32,
+ null=True,
+ verbose_name="Publication Date",
+ ),
+ ),
+ (
+ "publication_year",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=4,
+ null=True,
+ verbose_name="Publication Year",
+ ),
+ ),
+ (
+ "extra_data",
+ models.JSONField(
+ blank=True,
+ default=dict,
+ null=True,
+ verbose_name="Extra Data",
+ ),
+ ),
+ (
+ "collection",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ (
+ "creator",
+ models.ForeignKey(
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_creator",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Creator",
+ ),
+ ),
+ (
+ "updated_by",
+ models.ForeignKey(
+ blank=True,
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_last_mod_user",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Updater",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Source",
+ "verbose_name_plural": "Sources",
+ "unique_together": {("collection", "source_type", "source_id")},
+ "indexes": [
+ models.Index(
+ fields=["collection", "source_type"],
+ name="source_collection_type_idx",
+ ),
+ models.Index(
+ fields=["collection", "scielo_issn"],
+ name="source_collection_issn_idx",
+ ),
+ ],
+ },
+ ),
+ ]
diff --git a/source/migrations/0002_source_access_type.py b/source/migrations/0002_source_access_type.py
new file mode 100644
index 0000000..e148c15
--- /dev/null
+++ b/source/migrations/0002_source_access_type.py
@@ -0,0 +1,25 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("source", "0001_initial"),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name="source",
+ name="access_type",
+ field=models.CharField(
+ blank=True,
+ choices=[
+ ("open_access", "Open Access"),
+ ("commercial", "Commercial"),
+ ],
+ db_index=True,
+ max_length=32,
+ null=True,
+ verbose_name="Access Type",
+ ),
+ ),
+ ]
diff --git a/source/migrations/0003_alter_source_title.py b/source/migrations/0003_alter_source_title.py
new file mode 100644
index 0000000..354a82a
--- /dev/null
+++ b/source/migrations/0003_alter_source_title.py
@@ -0,0 +1,15 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("source", "0002_source_access_type"),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name="source",
+ name="title",
+ field=models.CharField(max_length=500, verbose_name="Source Title"),
+ ),
+ ]
diff --git a/source/migrations/__init__.py b/source/migrations/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/source/migrations/__init__.py
@@ -0,0 +1 @@
+
diff --git a/source/models.py b/source/models.py
new file mode 100644
index 0000000..48d3e00
--- /dev/null
+++ b/source/models.py
@@ -0,0 +1,219 @@
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+
+from collection.models import Collection
+from core.models import CommonControlField
+
+
+class Source(CommonControlField):
+ SOURCE_TYPE_JOURNAL = "journal"
+ SOURCE_TYPE_BOOK = "book"
+ SOURCE_TYPE_PREPRINT_SERVER = "preprint_server"
+ SOURCE_TYPE_DATA_REPOSITORY = "data_repository"
+ SOURCE_TYPE_OTHER = "other"
+ SOURCE_TYPE_CHOICES = (
+ (SOURCE_TYPE_JOURNAL, _("Journal")),
+ (SOURCE_TYPE_BOOK, _("Book")),
+ (SOURCE_TYPE_PREPRINT_SERVER, _("Preprint Server")),
+ (SOURCE_TYPE_DATA_REPOSITORY, _("Data Repository")),
+ (SOURCE_TYPE_OTHER, _("Other")),
+ )
+
+ ACCESS_TYPE_OPEN_ACCESS = "open_access"
+ ACCESS_TYPE_COMMERCIAL = "commercial"
+ ACCESS_TYPE_CHOICES = (
+ (ACCESS_TYPE_OPEN_ACCESS, _("Open Access")),
+ (ACCESS_TYPE_COMMERCIAL, _("Commercial")),
+ )
+
+ collection = models.ForeignKey(
+ Collection,
+ verbose_name=_("Collection"),
+ on_delete=models.CASCADE,
+ blank=False,
+ null=False,
+ db_index=True,
+ )
+
+ source_type = models.CharField(
+ verbose_name=_("Source Type"),
+ max_length=32,
+ choices=SOURCE_TYPE_CHOICES,
+ blank=False,
+ null=False,
+ db_index=True,
+ )
+
+ source_id = models.CharField(
+ verbose_name=_("Source ID"),
+ max_length=255,
+ blank=False,
+ null=False,
+ db_index=True,
+ )
+
+ scielo_issn = models.CharField(
+ verbose_name=_("SciELO ISSN"),
+ max_length=9,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ acronym = models.CharField(
+ verbose_name=_("Source Acronym"),
+ max_length=64,
+ blank=True,
+ null=True,
+ default="",
+ )
+
+ title = models.CharField(
+ verbose_name=_("Source Title"),
+ max_length=500,
+ blank=False,
+ null=False,
+ )
+
+ identifiers = models.JSONField(
+ verbose_name=_("Identifiers"),
+ null=True,
+ blank=True,
+ default=dict,
+ )
+
+ publisher_name = models.JSONField(
+ verbose_name=_("Publisher Name"),
+ blank=True,
+ null=True,
+ default=list,
+ )
+
+ subject_areas = models.JSONField(
+ verbose_name=_("Subject Areas (CAPES)"),
+ null=False,
+ blank=False,
+ default=list,
+ )
+
+ wos_subject_areas = models.JSONField(
+ verbose_name=_("Subject Areas (WoS)"),
+ null=False,
+ blank=False,
+ default=list,
+ )
+
+ default_lang = models.CharField(
+ verbose_name=_("Default Language"),
+ max_length=8,
+ blank=True,
+ null=True,
+ )
+
+ publication_date = models.CharField(
+ verbose_name=_("Publication Date"),
+ max_length=32,
+ blank=True,
+ null=True,
+ )
+
+ publication_year = models.CharField(
+ verbose_name=_("Publication Year"),
+ max_length=4,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ access_type = models.CharField(
+ verbose_name=_("Access Type"),
+ max_length=32,
+ choices=ACCESS_TYPE_CHOICES,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ extra_data = models.JSONField(
+ verbose_name=_("Extra Data"),
+ null=True,
+ blank=True,
+ default=dict,
+ )
+
+ def __str__(self):
+ return f"{self.collection.acron3} - {self.source_type} - {self.source_id}"
+
+ @staticmethod
+ def _extract_issns(identifiers):
+ if not isinstance(identifiers, dict):
+ return set()
+
+ return {
+ value
+ for key, value in identifiers.items()
+ if value and "issn" in str(key).lower()
+ }
+
+ @classmethod
+ def metadata(cls, collection=None):
+ queryset = cls.objects.select_related("collection").only(
+ "acronym",
+ "collection__acron3",
+ "default_lang",
+ "extra_data",
+ "identifiers",
+ "publication_date",
+ "publication_year",
+ "access_type",
+ "publisher_name",
+ "scielo_issn",
+ "source_id",
+ "source_type",
+ "subject_areas",
+ "title",
+ "wos_subject_areas",
+ )
+
+ if collection:
+ queryset = queryset.filter(collection=collection)
+
+ for source in queryset.iterator():
+ identifiers = source.identifiers or {}
+ yield {
+ "acronym": source.acronym,
+ "collection": source.collection.acron3,
+ "default_lang": source.default_lang,
+ "extra_data": source.extra_data or {},
+ "identifiers": identifiers,
+ "issns": cls._extract_issns(identifiers),
+ "publication_date": source.publication_date,
+ "publication_year": source.publication_year,
+ "access_type": source.access_type,
+ "publisher_name": source.publisher_name or [],
+ "scielo_issn": source.scielo_issn,
+ "source_id": source.source_id,
+ "source_type": source.source_type,
+ "subject_areas": source.subject_areas or [],
+ "title": source.title,
+ "wos_subject_areas": source.wos_subject_areas or [],
+ }
+
+ class Meta:
+ verbose_name = _("Source")
+ verbose_name_plural = _("Sources")
+ unique_together = (
+ "collection",
+ "source_type",
+ "source_id",
+ )
+ indexes = [
+ models.Index(
+ fields=["collection", "source_type"],
+ name="source_collection_type_idx",
+ ),
+ models.Index(
+ fields=["collection", "scielo_issn"],
+ name="source_collection_issn_idx",
+ ),
+ ]
diff --git a/source/services/__init__.py b/source/services/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/source/services/__init__.py
@@ -0,0 +1 @@
+
diff --git a/source/services/books.py b/source/services/books.py
new file mode 100644
index 0000000..df9bd4d
--- /dev/null
+++ b/source/services/books.py
@@ -0,0 +1,137 @@
+from collection.models import Collection
+from source.models import Source
+
+
+BOOKS_COLLECTION_ACRONYM = "books"
+
+
+def get_books_collection(acronym=BOOKS_COLLECTION_ACRONYM):
+ return Collection.objects.get(acron3=acronym)
+
+
+def upsert_monograph_source(
+ payload,
+ collection,
+ user=None,
+ force_update=True,
+ source_url=None,
+ last_seq=None,
+):
+ if payload.get("TYPE") != "Monograph":
+ return None
+
+ source, created = Source.objects.get_or_create(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_BOOK,
+ source_id=str(payload.get("id")),
+ )
+
+ if created and user:
+ source.creator = user
+
+ if created or force_update:
+ source.scielo_issn = None
+ source.acronym = ""
+ source.title = payload.get("title") or str(payload.get("id"))
+ source.identifiers = _build_source_identifiers(payload)
+ source.publisher_name = _as_list(payload.get("publisher"))
+ source.subject_areas = []
+ source.wos_subject_areas = []
+ source.default_lang = payload.get("language") or None
+ source.publication_date = payload.get("publication_date") or None
+ source.publication_year = _normalize_year(payload.get("year"))
+ source.access_type = _normalize_access_type(payload.get("is_comercial"))
+ source.extra_data = _build_source_extra_data(
+ payload,
+ source_url=source_url,
+ last_seq=last_seq,
+ )
+
+ if user:
+ source.updated_by = user
+
+ source.save()
+ return source
+
+
+def delete_book_source(collection, book_id):
+ return Source.objects.filter(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_BOOK,
+ source_id=str(book_id),
+ ).delete()
+
+
+def _build_source_identifiers(payload):
+ identifiers = {
+ "book_id": str(payload.get("id")) if payload.get("id") is not None else None,
+ "isbn": payload.get("isbn"),
+ "eisbn": payload.get("eisbn"),
+ "doi": payload.get("doi_number"),
+ }
+ return _compact_dict(identifiers)
+
+
+def _build_source_extra_data(payload, source_url=None, last_seq=None):
+ extra_data = {
+ "raw_type": payload.get("TYPE"),
+ "source_url": source_url,
+ "last_seq": last_seq,
+ "visible": payload.get("visible"),
+ "city": payload.get("city"),
+ "country": payload.get("country"),
+ "pages": payload.get("pages"),
+ "collection_data": payload.get("collection"),
+ "creators": payload.get("creators"),
+ "is_comercial": payload.get("is_comercial"),
+ "use_licence": payload.get("use_licence"),
+ "price_reais": payload.get("price_reais"),
+ "price_dollar": payload.get("price_dollar"),
+ "shopping_info": payload.get("shopping_info"),
+ "serie": payload.get("serie"),
+ "format": payload.get("format"),
+ "translated_titles": payload.get("translated_titles"),
+ "translated_synopses": payload.get("translated_synopses"),
+ "synopsis": payload.get("synopsis"),
+ "primary_descriptor": payload.get("primary_descriptor"),
+ "translated_primary_descriptors": payload.get("translated_primary_descriptors"),
+ }
+ return _compact_dict(extra_data)
+
+
+def _as_list(value):
+ if not value:
+ return []
+
+ if isinstance(value, list):
+ return value
+
+ return [value]
+
+
+def _normalize_year(value):
+ if value in (None, ""):
+ return None
+ return str(value)[:4]
+
+
+def _normalize_access_type(value):
+ if value in (None, ""):
+ return None
+
+ if isinstance(value, str):
+ normalized = value.strip().lower()
+ if normalized in {"true", "1", "yes", "y", "sim"}:
+ return Source.ACCESS_TYPE_COMMERCIAL
+ if normalized in {"false", "0", "no", "n", "nao", "não"}:
+ return Source.ACCESS_TYPE_OPEN_ACCESS
+
+ return Source.ACCESS_TYPE_COMMERCIAL if bool(value) else Source.ACCESS_TYPE_OPEN_ACCESS
+
+
+def _compact_dict(data):
+ return {
+ key: value
+ for key, value in data.items()
+ if value not in (None, "", [], {}, ())
+ }
diff --git a/source/services/journals.py b/source/services/journals.py
new file mode 100644
index 0000000..ac133f6
--- /dev/null
+++ b/source/services/journals.py
@@ -0,0 +1,118 @@
+from django.db.models import Q
+
+from collection.models import Collection
+from source.models import Source
+
+
+def get_collection(acronym):
+ return Collection.objects.filter(acron3=acronym).first()
+
+
+def upsert_journal_source(
+ journal,
+ collection,
+ user=None,
+ force_update=True,
+ load_mode=None,
+):
+ scielo_issn = _value(journal, "scielo_issn")
+ if not scielo_issn:
+ return None
+
+ source, created = Source.objects.get_or_create(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_JOURNAL,
+ source_id=scielo_issn,
+ )
+
+ if created and user:
+ source.creator = user
+
+ if created or force_update:
+ source.scielo_issn = scielo_issn
+ source.acronym = _value(journal, "acronym") or ""
+ source.title = _value(journal, "title") or scielo_issn
+ source.identifiers = _build_source_identifiers(journal)
+ source.publisher_name = _as_list(_value(journal, "publisher_name"))
+ source.subject_areas = _as_list(_value(journal, "subject_areas"))
+ source.wos_subject_areas = _as_list(_value(journal, "wos_subject_areas"))
+ source.default_lang = None
+ source.publication_date = None
+ source.publication_year = None
+ source.extra_data = _compact_dict(
+ {
+ "collection_acronym": _value(journal, "collection_acronym"),
+ "load_mode": load_mode,
+ }
+ )
+
+ if user:
+ source.updated_by = user
+
+ source.save()
+ return source
+
+
+def find_journal_source_by_issns(collection, issns):
+ for issn in filter(None, issns or []):
+ source = (
+ Source.objects.filter(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_JOURNAL,
+ )
+ .filter(
+ Q(scielo_issn=issn)
+ | Q(source_id=issn)
+ | Q(identifiers__electronic_issn=issn)
+ | Q(identifiers__print_issn=issn)
+ | Q(identifiers__scielo_issn=issn)
+ )
+ .first()
+ )
+ if source:
+ return source
+ return None
+
+
+def find_journal_source_by_acronym(collection, acronym):
+ if not acronym:
+ return None
+
+ return Source.objects.filter(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_JOURNAL,
+ acronym=acronym,
+ ).first()
+
+
+def _build_source_identifiers(journal):
+ identifiers = {
+ "electronic_issn": _value(journal, "electronic_issn"),
+ "print_issn": _value(journal, "print_issn"),
+ "scielo_issn": _value(journal, "scielo_issn"),
+ }
+ return _compact_dict(identifiers)
+
+
+def _as_list(value):
+ if not value:
+ return []
+
+ if isinstance(value, list):
+ return value
+
+ return [value]
+
+
+def _value(data, key, default=None):
+ if isinstance(data, dict):
+ return data.get(key, default)
+ return getattr(data, key, default)
+
+
+def _compact_dict(data):
+ return {
+ key: value
+ for key, value in data.items()
+ if value not in (None, "", [], {}, ())
+ }
diff --git a/source/tasks.py b/source/tasks.py
new file mode 100644
index 0000000..eb1633b
--- /dev/null
+++ b/source/tasks.py
@@ -0,0 +1,148 @@
+import logging
+
+from django.utils.translation import gettext as _
+from django.conf import settings
+
+from collection.models import Collection
+from config import celery_app
+from core.collectors import articlemeta as articlemeta_collector
+from core.collectors import scielo_books as scielo_books_collector
+from core.utils.request_utils import _get_user
+from source.services import books as books_service
+from source.services import journals as journal_service
+
+
+def load_sources_from_article_meta(
+ collections=None,
+ force_update=True,
+ user=None,
+ mode="thrift",
+):
+ collection_codes = collections or Collection.acron3_list()
+
+ for collection_code in collection_codes:
+ logging.info(
+ "Loading sources from Article Meta. Collection: %s, Mode: %s",
+ collection_code,
+ mode,
+ )
+
+ for journal in articlemeta_collector.iter_journals(
+ collection=collection_code,
+ mode=mode,
+ ):
+ collection = journal_service.get_collection(journal.collection_acronym)
+ if not collection:
+ logging.error(
+ "Collection %s does not exist",
+ journal.collection_acronym,
+ )
+ continue
+
+ source = journal_service.upsert_journal_source(
+ journal,
+ collection=collection,
+ user=user,
+ force_update=force_update,
+ load_mode=mode,
+ )
+ logging.info(
+ "Source %s upserted for collection %s",
+ source.source_id if source else None,
+ collection.acron3,
+ )
+
+ return True
+
+
+def load_sources_from_scielo_books(
+ collection="books",
+ db_name=settings.SCIELO_BOOKS_DB_NAME,
+ since=0,
+ limit=settings.SCIELO_BOOKS_LIMIT,
+ force_update=True,
+ headers=None,
+ base_url=None,
+ user=None,
+):
+ collection_obj = books_service.get_books_collection(collection)
+
+ logging.info(
+ "Loading sources from SciELO Books. Collection: %s, DB: %s, Since: %s, Limit: %s",
+ collection,
+ db_name,
+ since,
+ limit,
+ )
+
+ for item in scielo_books_collector.iter_change_documents(
+ base_url=base_url,
+ db_name=db_name,
+ since=since,
+ limit=limit,
+ headers=headers,
+ ):
+ change = item["change"]
+
+ if item["deleted"]:
+ books_service.delete_book_source(collection_obj, change.get("id"))
+ continue
+
+ payload = item["payload"] or {}
+ if payload.get("TYPE") != "Monograph":
+ continue
+
+ books_service.upsert_monograph_source(
+ payload,
+ collection=collection_obj,
+ user=user,
+ force_update=force_update,
+ source_url=item.get("source_url"),
+ last_seq=change.get("seq"),
+ )
+
+ return True
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Sources (Article Meta)"), queue="load")
+def task_load_sources_from_article_meta(
+ self,
+ collections=None,
+ force_update=True,
+ user_id=None,
+ username=None,
+ mode="thrift",
+):
+ user = _get_user(self.request, username=username, user_id=user_id)
+ return load_sources_from_article_meta(
+ collections=collections,
+ force_update=force_update,
+ user=user,
+ mode=mode,
+ )
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Sources (SciELO Books)"), queue="load")
+def task_load_sources_from_scielo_books(
+ self,
+ collection="books",
+ db_name=settings.SCIELO_BOOKS_DB_NAME,
+ since=0,
+ limit=settings.SCIELO_BOOKS_LIMIT,
+ force_update=True,
+ headers=None,
+ base_url=None,
+ user_id=None,
+ username=None,
+):
+ user = _get_user(self.request, username=username, user_id=user_id)
+ return load_sources_from_scielo_books(
+ collection=collection,
+ db_name=db_name,
+ since=since,
+ limit=limit,
+ force_update=force_update,
+ headers=headers,
+ base_url=base_url,
+ user=user,
+ )
diff --git a/source/tests.py b/source/tests.py
new file mode 100644
index 0000000..a182f4e
--- /dev/null
+++ b/source/tests.py
@@ -0,0 +1,133 @@
+from django.test import TestCase
+
+from collection.models import Collection
+
+from .models import Source
+from .services import books as books_service
+from .services import journals as journal_service
+
+
+class SourceMetadataTests(TestCase):
+ def test_source_type_choices_include_scielo_non_journal_sources(self):
+ self.assertIn(
+ (Source.SOURCE_TYPE_PREPRINT_SERVER, "Preprint Server"),
+ [(value, str(label)) for value, label in Source.SOURCE_TYPE_CHOICES],
+ )
+ self.assertIn(
+ (Source.SOURCE_TYPE_DATA_REPOSITORY, "Data Repository"),
+ [(value, str(label)) for value, label in Source.SOURCE_TYPE_CHOICES],
+ )
+
+ def test_metadata_exposes_generic_and_journal_fields(self):
+ collection = Collection.objects.create(acron3="scl", acron2="sc")
+ Source.objects.create(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_JOURNAL,
+ source_id="1234-5678",
+ scielo_issn="1234-5678",
+ acronym="testjou",
+ title="Test Journal",
+ identifiers={
+ "electronic_issn": "1234-5678",
+ "print_issn": "8765-4321",
+ "doi": "10.1590/example",
+ },
+ publisher_name=["SciELO"],
+ subject_areas=["Health Sciences"],
+ wos_subject_areas=["Medicine"],
+ default_lang="en",
+ publication_date="2024-01-15",
+ publication_year="2024",
+ extra_data={"country": "BR"},
+ )
+
+ metadata = list(Source.metadata(collection=collection))
+
+ self.assertEqual(len(metadata), 1)
+ self.assertEqual(metadata[0]["source_type"], Source.SOURCE_TYPE_JOURNAL)
+ self.assertEqual(metadata[0]["source_id"], "1234-5678")
+ self.assertEqual(metadata[0]["scielo_issn"], "1234-5678")
+ self.assertEqual(metadata[0]["issns"], {"1234-5678", "8765-4321"})
+ self.assertEqual(metadata[0]["title"], "Test Journal")
+
+ def test_upsert_monograph_source_maps_scielo_books_payload(self):
+ collection = Collection.objects.create(acron3="books", acron2="bk")
+
+ source = books_service.upsert_monograph_source(
+ {
+ "TYPE": "Monograph",
+ "id": "abcd1",
+ "title": "Sample Book",
+ "isbn": "9788578791889",
+ "eisbn": "9788578791880",
+ "doi_number": "10.1234/book",
+ "language": "pt",
+ "publication_date": "2024-05-20",
+ "year": "2024",
+ "publisher": "SciELO Books",
+ "is_comercial": False,
+ "visible": True,
+ },
+ collection=collection,
+ )
+
+ self.assertEqual(source.source_type, Source.SOURCE_TYPE_BOOK)
+ self.assertEqual(source.source_id, "abcd1")
+ self.assertEqual(source.identifiers["isbn"], "9788578791889")
+ self.assertEqual(source.default_lang, "pt")
+ self.assertEqual(source.publication_year, "2024")
+ self.assertEqual(source.access_type, Source.ACCESS_TYPE_OPEN_ACCESS)
+
+ def test_upsert_monograph_source_accepts_long_real_world_title(self):
+ collection = Collection.objects.create(acron3="books", acron2="bk")
+ title = (
+ "O Estado da Arte sobre Refugiados, Deslocados Internos, "
+ "Deslocados Ambientais e Apatridas no Brasil: atualizacao do "
+ "Diretorio Nacional do ACNUR de teses, dissertacoes, trabalhos "
+ "de conclusao de curso de graduacao em Joao Pessoa (Paraiba) e "
+ "artigos (2007 a 2017)"
+ )
+
+ source = books_service.upsert_monograph_source(
+ {
+ "TYPE": "Monograph",
+ "id": "9zzts",
+ "title": title,
+ },
+ collection=collection,
+ )
+
+ self.assertEqual(source.title, title)
+
+ def test_upsert_journal_source_maps_articlemeta_payload(self):
+ collection = Collection.objects.create(acron3="scl", acron2="sc")
+
+ source = journal_service.upsert_journal_source(
+ {
+ "collection_acronym": "scl",
+ "scielo_issn": "1234-5678",
+ "electronic_issn": "1234-5678",
+ "print_issn": "8765-4321",
+ "acronym": "testjou",
+ "title": "Test Journal",
+ "publisher_name": "SciELO",
+ "subject_areas": ["Health Sciences"],
+ "wos_subject_areas": ["Medicine"],
+ },
+ collection=collection,
+ load_mode="thrift",
+ )
+
+ self.assertEqual(source.source_type, Source.SOURCE_TYPE_JOURNAL)
+ self.assertEqual(source.source_id, "1234-5678")
+ self.assertEqual(source.identifiers["electronic_issn"], "1234-5678")
+ self.assertEqual(source.publisher_name, ["SciELO"])
+ self.assertEqual(source.extra_data["load_mode"], "thrift")
+ self.assertEqual(
+ journal_service.find_journal_source_by_issns(collection, ["8765-4321"]).pk,
+ source.pk,
+ )
+ self.assertEqual(
+ journal_service.find_journal_source_by_acronym(collection, "testjou").pk,
+ source.pk,
+ )
diff --git a/source/wagtail_hooks.py b/source/wagtail_hooks.py
new file mode 100644
index 0000000..5ffad62
--- /dev/null
+++ b/source/wagtail_hooks.py
@@ -0,0 +1,32 @@
+from django.utils.translation import gettext_lazy as _
+from wagtail.snippets.views.snippets import SnippetViewSet
+
+from .models import Source
+
+
+class SourceSnippetViewSet(SnippetViewSet):
+ model = Source
+ icon = "folder-open-inverse"
+ menu_label = _("Source")
+ menu_order = 200
+
+ list_display = (
+ "collection",
+ "source_type",
+ "source_id",
+ "scielo_issn",
+ "acronym",
+ "title",
+ "publication_year",
+ )
+ list_filter = (
+ "collection",
+ "source_type",
+ "publication_year",
+ )
+ search_fields = (
+ "source_id",
+ "scielo_issn",
+ "acronym",
+ "title",
+ )
diff --git a/start-dev.sh b/start-dev.sh
deleted file mode 100644
index 92d064a..0000000
--- a/start-dev.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-# Change this value to the local ethernet.
-ethernet=wlp0s20f3
-
-# Linux IP.
-export IP=$(/sbin/ip -o -4 addr list $ethernet | awk '{print $4}' | cut -d/ -f1)
-
-# Mac OS IP.
-#export IP=$(ifconfig $ethernet | grep inet | grep -v inet6 | awk '{print $2}')
-
-export DATABASE_URL=postgres://GVRFlLmcCNfGLhsFvSnCioYOPJPYpyfj:BQ4hSUL4rdj5WZLdR8ilDLRQMvCtzo0caMaXDO0olGsmycQjlcZlTVK9DepZR8kk@$IP:5432/scielo_core
-export CELERY_BROKER_URL=redis://$IP:6379/0
-export USE_DOCKER=no
-export IPYTHONDIR=/app/.ipython
-export REDIS_URL=redis://$IP:6379/0
-export CELERY_FLOWER_USER=PhFRdLexbrsBvrrbSXxjcMMOcVOavCrZ
-export CELERY_FLOWER_PASSWORD=QgScyefPrYhHgO6onW61u0nazc5xdBuP4sM7jMRrBBFuA2RjsFhZLp7xbVYZbrwR
-export EMAIL_HOST=$IP
-export SOLR_URL=http://$IP:8983/solr/
-
-
-docker stop scielo_core_local_django
-# workon scms
-python manage.py runserver_plus 0.0.0.0:8000
diff --git a/tracker/choices.py b/tracker/choices.py
index e2c80e2..dfc562c 100644
--- a/tracker/choices.py
+++ b/tracker/choices.py
@@ -1,54 +1,16 @@
from django.utils.translation import gettext_lazy as _
-ERROR = "ERROR"
-EXCEPTION = "EXCEPTION"
-INFO = "INFO"
-WARNING = "WARNING"
-
-EVENT_MSG_TYPE = [
- (ERROR, _("error")),
- (WARNING, _("warning")),
- (INFO, _("info")),
- (EXCEPTION, _("exception")),
-]
-
-
-PROGRESS_STATUS_IGNORED = "IGNORED"
-PROGRESS_STATUS_REPROC = "REPROC"
-PROGRESS_STATUS_TODO = "TODO"
-PROGRESS_STATUS_DOING = "DOING"
-PROGRESS_STATUS_DONE = "DONE"
-PROGRESS_STATUS_PENDING = "PENDING"
-
-PROGRESS_STATUS = (
- (PROGRESS_STATUS_REPROC, _("To reprocess")),
- (PROGRESS_STATUS_TODO, _("To do")),
- (PROGRESS_STATUS_DONE, _("Done")),
- (PROGRESS_STATUS_DOING, _("Doing")),
- (PROGRESS_STATUS_PENDING, _("Pending")),
- (PROGRESS_STATUS_IGNORED, _("ignored")),
-)
-
LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA = 'MET'
-LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE = 'ART'
-LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL = 'JOU'
+LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT = 'DOC'
+LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE = 'SRC'
LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION = 'URL'
LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR = 'DBE'
LOG_FILE_DISCARDED_LINE_REASON = [
(LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA, _("Missing Metadata")),
- (LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE, _("Missing PIDv2 or PIDv3 or PID Generic")),
- (LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL, _("Missing ISSN")),
+ (LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT, _("Missing Document")),
+ (LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE, _("Missing Source")),
(LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION, _("URL Translation")),
(LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR, _("Database Error")),
]
-
-
-ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED = 'MUL'
-ARTICLE_EVENT_TYPE_DATA_ERROR = 'ERR'
-
-ARTICLE_EVENT_TYPE = [
- (ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, _("Multiple Articles Returned")),
- (ARTICLE_EVENT_TYPE_DATA_ERROR, _("Data Error")),
-]
diff --git a/tracker/exceptions.py b/tracker/exceptions.py
index 31ed8c8..9ef3267 100644
--- a/tracker/exceptions.py
+++ b/tracker/exceptions.py
@@ -1,26 +1,2 @@
-class ProcEventCreateError(Exception):
- ...
-
-class UnexpectedEventCreateError(Exception):
- ...
-
-class EventCreateError(Exception):
- ...
-
-class EventReportCreateError(Exception):
- ...
-
-class EventReportSaveFileError(Exception):
- ...
-
-class EventReportCreateError(Exception):
- ...
-
-class EventReportDeleteEventsError(Exception):
- ...
-
class LogFileDiscardedLineCreateError(Exception):
...
-
-class ArticleEventError(Exception):
- ...
diff --git a/tracker/migrations/0001_initial.py b/tracker/migrations/0001_initial.py
index f207722..04fdc35 100644
--- a/tracker/migrations/0001_initial.py
+++ b/tracker/migrations/0001_initial.py
@@ -1,13 +1,18 @@
-# Generated by Django 5.0.7 on 2024-08-30 00:52
+# Generated by Codex on 2026-04-27
+import django.db.models.deletion
import uuid
+from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
- dependencies = []
+ dependencies = [
+ ("log_manager", "0001_initial"),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
operations = [
migrations.CreateModel(
@@ -24,21 +29,15 @@ class Migration(migrations.Migration):
),
(
"created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
+ models.DateTimeField(auto_now_add=True, verbose_name="Creation date"),
),
(
"exception_type",
- models.TextField(
- blank=True, null=True, verbose_name="Exception Type"
- ),
+ models.TextField(blank=True, null=True, verbose_name="Exception Type"),
),
(
"exception_msg",
- models.TextField(
- blank=True, null=True, verbose_name="Exception Msg"
- ),
+ models.TextField(blank=True, null=True, verbose_name="Exception Msg"),
),
("traceback", models.JSONField(blank=True, null=True)),
("detail", models.JSONField(blank=True, null=True)),
@@ -46,9 +45,148 @@ class Migration(migrations.Migration):
options={
"indexes": [
models.Index(
- fields=["exception_type"], name="tracker_une_excepti_47ede4_idx"
+ fields=["exception_type"],
+ name="tracker_une_excepti_47ede4_idx",
)
],
},
),
+ migrations.CreateModel(
+ name="ArticleEvent",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "created",
+ models.DateTimeField(auto_now_add=True, verbose_name="Creation date"),
+ ),
+ (
+ "updated",
+ models.DateTimeField(auto_now=True, verbose_name="Last update date"),
+ ),
+ (
+ "event_type",
+ models.CharField(
+ blank=True,
+ choices=[
+ ("MUL", "Multiple Articles Returned"),
+ ("ERR", "Data Error"),
+ ],
+ max_length=3,
+ null=True,
+ verbose_name="Event Type",
+ ),
+ ),
+ (
+ "message",
+ models.TextField(blank=True, null=True, verbose_name="Message"),
+ ),
+ ("data", models.JSONField(default=dict, verbose_name="Data")),
+ ("handled", models.BooleanField(default=False, verbose_name="Handled")),
+ (
+ "creator",
+ models.ForeignKey(
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_creator",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Creator",
+ ),
+ ),
+ (
+ "updated_by",
+ models.ForeignKey(
+ blank=True,
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_last_mod_user",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Updater",
+ ),
+ ),
+ ],
+ ),
+ migrations.CreateModel(
+ name="LogFileDiscardedLine",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "created",
+ models.DateTimeField(auto_now_add=True, verbose_name="Creation date"),
+ ),
+ (
+ "updated",
+ models.DateTimeField(auto_now=True, verbose_name="Last update date"),
+ ),
+ (
+ "error_type",
+ models.CharField(
+ blank=True,
+ choices=[
+ ("MET", "Missing Metadata"),
+ ("DOC", "Missing Document"),
+ ("SRC", "Missing Source"),
+ ("URL", "URL Translation"),
+ ("DBE", "Database Error"),
+ ],
+ max_length=3,
+ null=True,
+ verbose_name="Error Type",
+ ),
+ ),
+ ("data", models.JSONField(default=dict, verbose_name="Data")),
+ (
+ "message",
+ models.TextField(blank=True, null=True, verbose_name="Message"),
+ ),
+ ("handled", models.BooleanField(default=False, verbose_name="Handled")),
+ (
+ "creator",
+ models.ForeignKey(
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_creator",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Creator",
+ ),
+ ),
+ (
+ "log_file",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="log_manager.logfile",
+ ),
+ ),
+ (
+ "updated_by",
+ models.ForeignKey(
+ blank=True,
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_last_mod_user",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Updater",
+ ),
+ ),
+ ],
+ ),
]
diff --git a/tracker/migrations/0002_remove_articleevent_creator_and_more.py b/tracker/migrations/0002_remove_articleevent_creator_and_more.py
new file mode 100644
index 0000000..ee23c85
--- /dev/null
+++ b/tracker/migrations/0002_remove_articleevent_creator_and_more.py
@@ -0,0 +1,38 @@
+# Generated by Django 5.2.12 on 2026-05-01 22:23
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("tracker", "0001_initial"),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name="articleevent",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="articleevent",
+ name="updated_by",
+ ),
+ migrations.DeleteModel(
+ name="UnexpectedEvent",
+ ),
+ migrations.RemoveField(
+ model_name="logfilediscardedline",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="logfilediscardedline",
+ name="updated",
+ ),
+ migrations.RemoveField(
+ model_name="logfilediscardedline",
+ name="updated_by",
+ ),
+ migrations.DeleteModel(
+ name="ArticleEvent",
+ ),
+ ]
diff --git a/tracker/migrations/0002_top100articlesfileevent.py b/tracker/migrations/0002_top100articlesfileevent.py
deleted file mode 100644
index 230fb8a..0000000
--- a/tracker/migrations/0002_top100articlesfileevent.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Generated by Django 5.0.7 on 2024-08-30 21:52
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0002_alter_top100articlesfile_status"),
- ("tracker", "0001_initial"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.CreateModel(
- name="Top100ArticlesFileEvent",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
- ),
- (
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
- ),
- (
- "status",
- models.CharField(
- blank=True, max_length=64, null=True, verbose_name="Status"
- ),
- ),
- (
- "lines",
- models.IntegerField(
- blank=True, default=0, null=True, verbose_name="Lines"
- ),
- ),
- (
- "message",
- models.TextField(blank=True, null=True, verbose_name="Message"),
- ),
- (
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
- ),
- ),
- (
- "file",
- models.ForeignKey(
- blank=True,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- to="metrics.top100articlesfile",
- ),
- ),
- (
- "updated_by",
- models.ForeignKey(
- blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
- ),
- ),
- ],
- options={
- "verbose_name_plural": "Top 100 Article File Events",
- },
- ),
- ]
diff --git a/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py b/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py
deleted file mode 100644
index 6e37a9f..0000000
--- a/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-07 16:55
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("log_manager", "0002_alter_collectionconfig_unique_together_and_more"),
- ("tracker", "0002_top100articlesfileevent"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.CreateModel(
- name="LogFileDiscardedLine",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
- ),
- (
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
- ),
- (
- "error_type",
- models.CharField(
- blank=True,
- choices=[
- ("MET", "Missing Metadata"),
- ("ART", "Missing Article"),
- ("JOU", "Missing Journal"),
- ],
- max_length=3,
- null=True,
- verbose_name="Error Type",
- ),
- ),
- ("data", models.JSONField(default=dict, verbose_name="Data")),
- (
- "message",
- models.TextField(blank=True, null=True, verbose_name="Message"),
- ),
- ("handled", models.BooleanField(default=False, verbose_name="Handled")),
- (
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
- ),
- ),
- (
- "log_file",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="log_manager.logfile",
- ),
- ),
- (
- "updated_by",
- models.ForeignKey(
- blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
- ),
- ),
- ],
- options={
- "abstract": False,
- },
- ),
- migrations.DeleteModel(
- name="Top100ArticlesFileEvent",
- ),
- ]
diff --git a/tracker/migrations/0004_alter_logfilediscardedline_error_type.py b/tracker/migrations/0004_alter_logfilediscardedline_error_type.py
deleted file mode 100644
index 1061793..0000000
--- a/tracker/migrations/0004_alter_logfilediscardedline_error_type.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-27 20:40
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("tracker", "0003_logfilediscardedline_delete_top100articlesfileevent"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="logfilediscardedline",
- name="error_type",
- field=models.CharField(
- blank=True,
- choices=[
- ("MET", "Missing Metadata"),
- ("ART", "Missing Article"),
- ("JOU", "Missing Journal"),
- ("URL", "URL Translation"),
- ],
- max_length=3,
- null=True,
- verbose_name="Error Type",
- ),
- ),
- ]
diff --git a/tracker/migrations/0005_articleevent.py b/tracker/migrations/0005_articleevent.py
deleted file mode 100644
index 859910e..0000000
--- a/tracker/migrations/0005_articleevent.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Generated by Django 5.0.7 on 2025-05-23 17:27
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("tracker", "0004_alter_logfilediscardedline_error_type"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.CreateModel(
- name="ArticleEvent",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
- ),
- (
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
- ),
- (
- "event_type",
- models.CharField(
- blank=True,
- choices=[
- ("MUL", "Multiple Articles Returned"),
- ("ERR", "Data Error"),
- ],
- max_length=3,
- null=True,
- verbose_name="Event Type",
- ),
- ),
- (
- "message",
- models.TextField(blank=True, null=True, verbose_name="Message"),
- ),
- ("data", models.JSONField(default=dict, verbose_name="Data")),
- ("handled", models.BooleanField(default=False, verbose_name="Handled")),
- (
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
- ),
- ),
- (
- "updated_by",
- models.ForeignKey(
- blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
- ),
- ),
- ],
- options={
- "abstract": False,
- },
- ),
- ]
diff --git a/tracker/migrations/0006_alter_logfilediscardedline_error_type.py b/tracker/migrations/0006_alter_logfilediscardedline_error_type.py
deleted file mode 100644
index fb7f74a..0000000
--- a/tracker/migrations/0006_alter_logfilediscardedline_error_type.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-14 10:46
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("tracker", "0005_articleevent"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="logfilediscardedline",
- name="error_type",
- field=models.CharField(
- blank=True,
- choices=[
- ("MET", "Missing Metadata"),
- ("ART", "Missing Article"),
- ("JOU", "Missing Journal"),
- ("URL", "URL Translation"),
- ("DBE", "Database Error"),
- ],
- max_length=3,
- null=True,
- verbose_name="Error Type",
- ),
- ),
- ]
diff --git a/tracker/migrations/0007_alter_logfilediscardedline_error_type.py b/tracker/migrations/0007_alter_logfilediscardedline_error_type.py
deleted file mode 100644
index f9ffebe..0000000
--- a/tracker/migrations/0007_alter_logfilediscardedline_error_type.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Generated by Django 5.0.7 on 2025-08-09 21:04
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("tracker", "0006_alter_logfilediscardedline_error_type"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="logfilediscardedline",
- name="error_type",
- field=models.CharField(
- blank=True,
- choices=[
- ("MET", "Missing Metadata"),
- ("ART", "Missing PIDv2 or PIDv3 or PID Generic"),
- ("JOU", "Missing ISSN"),
- ("URL", "URL Translation"),
- ("DBE", "Database Error"),
- ],
- max_length=3,
- null=True,
- verbose_name="Error Type",
- ),
- ),
- ]
diff --git a/tracker/models.py b/tracker/models.py
index 77086ee..a394ed6 100644
--- a/tracker/models.py
+++ b/tracker/models.py
@@ -1,65 +1,13 @@
-import json
-import logging
-import traceback
-import uuid
-
-from datetime import datetime
-
-from django.core.files.base import ContentFile
from django.db import models
from django.utils.translation import gettext_lazy as _
-from core.models import CommonControlField
from log_manager.models import LogFile
from tracker import choices
-
-from .exceptions import *
+from .exceptions import LogFileDiscardedLineCreateError
-class ArticleEvent(CommonControlField):
- event_type = models.CharField(
- _("Event Type"),
- choices=choices.ARTICLE_EVENT_TYPE,
- max_length=3,
- null=True,
- blank=True,
- )
-
- message = models.TextField(
- _("Message"),
- null=True,
- blank=True,
- )
-
- data = models.JSONField(
- _("Data"),
- default=dict,
- )
-
- handled = models.BooleanField(
- _("Handled"),
- default=False
- )
-
- @classmethod
- def create(cls, event_type, message, data):
- try:
- obj = cls()
- obj.event_type = event_type
- obj.message = message
- obj.data = data
- obj.save()
- except Exception as exc:
- raise ArticleEventError(
- f"Unable to create ArticleEvent ({data} - {event_type} - {message}). EXCEPTION {exc}"
- )
- return obj
-
- def __str__(self):
- return f"{self.event_type} - {self.message}"
-
-
-class LogFileDiscardedLine(CommonControlField):
+class LogFileDiscardedLine(models.Model):
+ created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
log_file = models.ForeignKey(
LogFile,
on_delete=models.CASCADE,
@@ -108,174 +56,4 @@ def __str__(self):
return f"{self.data} - {self.message}"
-class UnexpectedEvent(models.Model):
- id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
- created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
- exception_type = models.TextField(_("Exception Type"), null=True, blank=True)
- exception_msg = models.TextField(_("Exception Msg"), null=True, blank=True)
- traceback = models.JSONField(null=True, blank=True)
- detail = models.JSONField(null=True, blank=True)
-
- class Meta:
- indexes = [
- models.Index(fields=["exception_type"]),
- ]
-
- def __str__(self):
- return f"{self.exception_msg}"
-
- @property
- def data(self):
- return dict(
- created=self.created.isoformat(),
- exception_type=self.exception_type,
- exception_msg=self.exception_msg,
- traceback=json.dumps(self.traceback),
- detail=json.dumps(self.detail),
- )
-
- @classmethod
- def create(
- cls,
- exception=None,
- exc_traceback=None,
- detail=None,
- ):
- try:
- if exception:
- logging.exception(exception)
-
- obj = cls()
- obj.exception_msg = str(exception)
- obj.exception_type = str(type(exception))
- try:
- json.dumps(detail)
- obj.detail = detail
- except Exception as e:
- obj.detail = str(detail)
- if exc_traceback:
- obj.traceback = traceback.format_tb(exc_traceback)
- obj.save()
- return obj
- except Exception as exc:
- raise UnexpectedEventCreateError(
- f"Unable to create unexpected event ({exception} {exc_traceback}). EXCEPTION {exc}"
- )
-
-
-class Event(CommonControlField):
- id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
- message = models.TextField(_("Message"), null=True, blank=True)
- message_type = models.CharField(
- _("Message type"),
- choices=choices.EVENT_MSG_TYPE,
- max_length=16,
- null=True,
- blank=True,
- )
- detail = models.JSONField(null=True, blank=True)
- unexpected_event = models.ForeignKey(
- 'UnexpectedEvent', on_delete=models.SET_NULL, null=True, blank=True
- )
-
- class Meta:
- abstract = True
- indexes = [
- models.Index(fields=["message_type"]),
- ]
-
- @property
- def data(self):
- d = {}
- d["created"] = self.created.isoformat()
- d["user"] = self.user.username
- d.update(
- dict(
- message=self.message, message_type=self.message_type, detail=self.detail
- )
- )
- if self.unexpected_event:
- d.update(self.unexpected_event.data)
- return d
-
- @classmethod
- def create(
- cls,
- user=None,
- message_type=None,
- message=None,
- e=None,
- exc_traceback=None,
- detail=None,
- ):
- try:
- obj = cls()
- obj.creator = user
- obj.message = message
- obj.message_type = message_type
- obj.detail = detail
- obj.save()
-
- if e:
- logging.exception(f"{message}: {e}")
- obj.unexpected_event = UnexpectedEvent.create(
- exception=e,
- exc_traceback=exc_traceback,
- )
- obj.save()
- except Exception as exc:
- raise EventCreateError(
- f"Unable to create Event ({message} {e}). EXCEPTION: {exc}"
- )
- return obj
-
-
-def tracker_file_directory_path(instance, filename):
- d = datetime.now(datetime.timezone.utc)
- return f"tracker/{d.year}/{d.month}/{d.day}/{filename}"
-
-
-class EventReport(CommonControlField):
- file = models.FileField(
- upload_to=tracker_file_directory_path, null=True, blank=True
- )
-
- class Meta:
- abstract = True
-
- def save_file(self, events, ext=None):
- if not events:
- return
- try:
- ext = ".json"
- content = json.dumps(list([item.data for item in events]))
- name = datetime.now(datetime.timezone.utc).isoformat() + ext
- self.file.save(name, ContentFile(content))
- self.delete_events(events)
- except Exception as e:
- raise EventReportSaveFileError(
- f"Unable to save EventReport.file ({name}). Exception: {e}"
- )
-
- def delete_events(self, events):
- for item in events:
- try:
- item.unexpected_event.delete()
- except:
- pass
- try:
- item.delete()
- except:
- pass
-
- @classmethod
- def create(cls, user):
- try:
- obj = cls()
- obj.creator = user
- obj.save()
- except Exception as e:
- raise EventReportCreateError(
- f"Unable to create EventReport. Exception: {e}"
- )
diff --git a/tracker/tasks.py b/tracker/tasks.py
deleted file mode 100644
index ace8145..0000000
--- a/tracker/tasks.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# tasks.py
-from datetime import datetime
-
-from django.contrib.auth import get_user_model
-
-from config import celery_app
-from core.utils.utils import _get_user
-
-from .models import UnexpectedEvent
-
-
-User = get_user_model()
-
-
-@celery_app.task(bind=True, name="Cleanup unexpected events")
-def delete_unexpected_events(self, exception_type, start_date=None, end_date=None, user_id=None, username=None):
- """
- Delete UnexpectedEvent records based on exception type and optional date range.
- """
- user = _get_user(self.request, username=username, user_id=user_id)
-
- if exception_type == '__all__':
- UnexpectedEvent.objects.all().delete()
- return
-
- filters = {'exception_type__icontains': exception_type}
- if start_date:
- start_date = datetime.fromisoformat(start_date)
- filters['created__gte'] = start_date
- if end_date:
- end_date = datetime.fromisoformat(end_date)
- filters['created__lte'] = end_date
-
- UnexpectedEvent.objects.filter(**filters).delete()