mirror of
https://github.com/hashiromer/Upwork-Jobs-scraper-.git
synced 2025-12-29 16:16:01 +00:00
Getting cookies from .env file
This commit is contained in:
6
main.go
6
main.go
@@ -1,13 +1,11 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import "scrapers/upwork"
|
||||||
"scrapers/upwork"
|
|
||||||
)
|
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
|
||||||
p := upwork.InitPipeline()
|
p := upwork.InitPipeline()
|
||||||
err := p.Run("pdf")
|
err := p.Run("")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,13 @@ package upwork
|
|||||||
|
|
||||||
import "time"
|
import "time"
|
||||||
|
|
||||||
|
// LoggedOutError describes the JSON shape of an error payload: a nested
// Error object carrying an integer code and a message string. The nested
// field has no JSON tag, so encoding/json matches it against the key
// "Error" (case-insensitively).
// NOTE(review): the name suggests this is what the API returns when the
// session is not authenticated — confirm against a real response.
type LoggedOutError struct {
|
||||||
|
Error struct {
|
||||||
|
Code int `json:"code"`
|
||||||
|
Message string `json:"message"`
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
type UpworkApiResponse struct {
|
type UpworkApiResponse struct {
|
||||||
URL string `json:"url"`
|
URL string `json:"url"`
|
||||||
SearchGUID string `json:"searchGuid"`
|
SearchGUID string `json:"searchGuid"`
|
||||||
@@ -276,7 +283,6 @@ type UpworkApiResponse struct {
|
|||||||
Jordan int `json:"Jordan"`
|
Jordan int `json:"Jordan"`
|
||||||
Bulgaria int `json:"Bulgaria"`
|
Bulgaria int `json:"Bulgaria"`
|
||||||
Tunisia int `json:"Tunisia"`
|
Tunisia int `json:"Tunisia"`
|
||||||
CongoTheDemocraticRepublicOfThe int `json:"Congo, the Democratic Republic of the"`
|
|
||||||
UnitedArabEmirates int `json:"United Arab Emirates"`
|
UnitedArabEmirates int `json:"United Arab Emirates"`
|
||||||
Kenya int `json:"Kenya"`
|
Kenya int `json:"Kenya"`
|
||||||
FrenchPolynesia int `json:"French Polynesia"`
|
FrenchPolynesia int `json:"French Polynesia"`
|
||||||
@@ -337,7 +343,7 @@ type UpworkApiResponse struct {
|
|||||||
Togo int `json:"Togo"`
|
Togo int `json:"Togo"`
|
||||||
SouthernAsia int `json:"Southern Asia"`
|
SouthernAsia int `json:"Southern Asia"`
|
||||||
Philippines int `json:"Philippines"`
|
Philippines int `json:"Philippines"`
|
||||||
CoteDIvoire int `json:"Cote d'Ivoire"`
|
CoteDIvoire int `json:"Cote d\'Ivoire"`
|
||||||
Uzbekistan int `json:"Uzbekistan"`
|
Uzbekistan int `json:"Uzbekistan"`
|
||||||
Asia int `json:"Asia"`
|
Asia int `json:"Asia"`
|
||||||
BritishVirginIslands int `json:"British Virgin Islands"`
|
BritishVirginIslands int `json:"British Virgin Islands"`
|
||||||
@@ -474,7 +480,6 @@ type UpworkApiResponse struct {
|
|||||||
Uganda int `json:"Uganda"`
|
Uganda int `json:"Uganda"`
|
||||||
Mexico int `json:"Mexico"`
|
Mexico int `json:"Mexico"`
|
||||||
Suriname int `json:"Suriname"`
|
Suriname int `json:"Suriname"`
|
||||||
Micronesia int `json:"Micronesia"`
|
|
||||||
Greenland int `json:"Greenland"`
|
Greenland int `json:"Greenland"`
|
||||||
} `json:"location"`
|
} `json:"location"`
|
||||||
Timezone struct {
|
Timezone struct {
|
||||||
|
|||||||
@@ -1,8 +1,12 @@
|
|||||||
package upwork
|
package upwork
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
"scrapers/network"
|
"scrapers/network"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Upwork struct {
|
type Upwork struct {
|
||||||
@@ -16,8 +20,8 @@ type UrlArgs struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ConstructUrl builds the job-search URL from args by formatting the query
// text, the page size and the page number into a fixed URL template. The
// template also pins sort=recency and payment_verified=1.
//
// The two versions shown here differ only in where q appears in the query
// string (first vs. last) and, correspondingly, in the order of the Sprintf
// arguments.
//
// NOTE(review): args.Query is inserted verbatim with %s. A query containing
// spaces, '&', '#' or '=' will produce a malformed or misinterpreted URL;
// it should be passed through url.QueryEscape before formatting.
func (u Upwork) ConstructUrl(args UrlArgs) string {
|
func (u Upwork) ConstructUrl(args UrlArgs) string {
|
||||||
url := "https://www.upwork.com/ab/jobs/search/url?q=%s&per_page=%d&sort=recency&payment_verified=1&page=%d"
|
url := "https://www.upwork.com/ab/jobs/search/url?per_page=%d&sort=recency&payment_verified=1&page=%d&q=%s"
|
||||||
return fmt.Sprintf(url, args.Query, args.Per_Page, args.Page)
|
return fmt.Sprintf(url, args.Per_Page, args.Page, args.Query)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (u Upwork) SendRequest(url string) (string, error) {
|
func (u Upwork) SendRequest(url string) (string, error) {
|
||||||
@@ -31,6 +35,45 @@ func (u Upwork) SendRequest(url string) (string, error) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mergeMaps copies every key/value pair of m2 into m1 and returns m1.
//
// Keys present in both maps end up with the value from m2. m1 is modified in
// place, so the returned map is the same map the caller passed in. If m1 is
// nil a fresh map is allocated first, because assigning into a nil map
// panics at run time. m2 may be nil; it is only read.
func mergeMaps(m1, m2 map[string]string) map[string]string {
	if m1 == nil {
		m1 = make(map[string]string, len(m2))
	}
	for k, v := range m2 {
		m1[k] = v
	}
	return m1
}
|
||||||
|
|
||||||
|
// readEnv parses a simple KEY=VALUE file into a map.
//
// Each line is split on its first "=", so a value may itself contain "=".
// Keys and values are trimmed of surrounding whitespace. Blank lines, lines
// beginning with "#" and lines without any "=" are ignored. If a key occurs
// more than once, the last occurrence wins.
//
// It returns the error from os.Open unchanged when the file cannot be
// opened, and a wrapped error when reading the file fails part-way.
func readEnv(filename string) (map[string]string, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	m := make(map[string]string)

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}

		// Split only once so that "=" inside the value is preserved.
		parts := strings.SplitN(line, "=", 2)
		if len(parts) != 2 {
			continue
		}
		m[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
	}

	// Scan returns false both at EOF and on failure (an I/O error, or a line
	// longer than the scanner's token limit). Without this check a failure
	// would be indistinguishable from a short file and the caller would
	// receive a silently truncated map.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading env file %q: %w", filename, err)
	}

	return m, nil
}
|
||||||
|
|
||||||
func InitUpwork() *Upwork {
|
func InitUpwork() *Upwork {
|
||||||
headers := map[string]string{
|
headers := map[string]string{
|
||||||
"authority": "www.upwork.com",
|
"authority": "www.upwork.com",
|
||||||
@@ -47,6 +90,12 @@ func InitUpwork() *Upwork {
|
|||||||
"x-odesk-user-agent": "oDesk LM",
|
"x-odesk-user-agent": "oDesk LM",
|
||||||
"x-requested-with": "XMLHttpRequest",
|
"x-requested-with": "XMLHttpRequest",
|
||||||
}
|
}
|
||||||
|
auth_headers, err := readEnv("upwork/.env")
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal("Could not read .env.auth file")
|
||||||
|
}
|
||||||
|
headers = mergeMaps(auth_headers, headers)
|
||||||
|
|
||||||
client := network.InitClient(headers)
|
client := network.InitClient(headers)
|
||||||
upwork := Upwork{
|
upwork := Upwork{
|
||||||
UpworkHttpClient: client,
|
UpworkHttpClient: client,
|
||||||
|
|||||||
@@ -24,10 +24,6 @@ func InitPipeline() *UpworkPipeLine {
|
|||||||
func (u *UpworkPipeLine) CombineFiles() error {
|
func (u *UpworkPipeLine) CombineFiles() error {
|
||||||
|
|
||||||
var all_jobs []interface{}
|
var all_jobs []interface{}
|
||||||
// all_filenames, err := filepath.Glob("data/*.json")
|
|
||||||
// if err != nil {
|
|
||||||
// panic(err)
|
|
||||||
// }
|
|
||||||
|
|
||||||
for _, file := range u.filepaths {
|
for _, file := range u.filepaths {
|
||||||
data, err := os.ReadFile(file)
|
data, err := os.ReadFile(file)
|
||||||
@@ -76,41 +72,60 @@ func (u *UpworkPipeLine) saveToFile(data []byte, filename string) error {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// isJSON (renamed isValidJSON on the new side of this diff) reports whether
// s can be decoded by json.Unmarshal into a map[string]interface{} without
// error. Because the target is a map, a top-level JSON array, string or
// number is rejected even though it is syntactically valid JSON.
func isJSON(s string) bool {
|
func isValidJSON(s string) bool {
|
||||||
var js map[string]interface{}
|
var js map[string]interface{}
|
||||||
return json.Unmarshal([]byte(s), &js) == nil
|
return json.Unmarshal([]byte(s), &js) == nil
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func isApiError(data string) bool {
|
// func print_json(p string) {
|
||||||
|
// b, err := json.MarshalIndent(p, "", " ")
|
||||||
|
// if err != nil {
|
||||||
|
// fmt.Println(err)
|
||||||
|
// return
|
||||||
|
// }
|
||||||
|
// fmt.Println(string(b))
|
||||||
|
// }
|
||||||
|
|
||||||
var result map[string]interface{}
|
// isValidApiResponse reports whether data is a usable job-search response.
//
// Only the fields needed for the decision are decoded:
//
//   - A top-level "error" object with a non-zero code or a non-empty
//     message is a known API error: the message is logged and the result
//     is false.
//   - Input that does not decode, or that has no "searchResults" object,
//     is an unknown format: the payload is printed and the result is false.
//   - Otherwise the result is the negation of searchResults.jobSearchError.
//
// The decision is based on which fields are actually populated rather than
// on whether decoding succeeds. encoding/json ignores keys that have no
// matching struct field, so an error payload decodes into a response struct
// without error; relying on the decode error alone classifies such a
// payload as a valid response.
func isValidApiResponse(data string) bool {
	var probe struct {
		// Untagged on purpose: matches the key "Error" case-insensitively.
		Error *struct {
			Code    int    `json:"code"`
			Message string `json:"message"`
		}
		SearchResults *struct {
			JobSearchError bool `json:"jobSearchError"`
		} `json:"searchResults"`
	}

	decodeErr := json.Unmarshal([]byte(data), &probe)

	// A known error occurred.
	if decodeErr == nil && probe.Error != nil &&
		(probe.Error.Code != 0 || probe.Error.Message != "") {
		log.Print(probe.Error.Message)
		return false
	}

	// Unknown response format.
	if decodeErr != nil || probe.SearchResults == nil {
		fmt.Print("Unknown response format")
		fmt.Print(data)
		return false
	}

	return !probe.SearchResults.JobSearchError
}
|
||||||
|
|
||||||
// validateResponse (renamed isResponseValid on the new side of this diff)
// reports whether data passes both checks: it must decode as a JSON object,
// and the API-level check must accept it. The old side negates isApiError;
// the new side uses isValidApiResponse directly. The JSON check runs first
// and short-circuits, so the API-level check is skipped when it fails.
func (u *UpworkPipeLine) validateResponse(data string) bool {
|
func (u *UpworkPipeLine) isResponseValid(data string) bool {
|
||||||
return isJSON(data) && !isApiError(data)
|
|
||||||
|
return isValidJSON(data) && isValidApiResponse(data)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (u *UpworkPipeLine) getTotalDocuments(urlArgs UrlArgs) (int, error) {
|
func (u *UpworkPipeLine) getTotalDocuments(urlArgs UrlArgs) (int, error) {
|
||||||
|
|
||||||
|
var API_Response UpworkApiResponse
|
||||||
client := u.upworkClient
|
client := u.upworkClient
|
||||||
url := client.ConstructUrl(urlArgs)
|
url := client.ConstructUrl(urlArgs)
|
||||||
resp, err := u.upworkClient.SendRequest(url)
|
resp, err := u.upworkClient.SendRequest(url)
|
||||||
@@ -122,12 +137,10 @@ func (u *UpworkPipeLine) getTotalDocuments(urlArgs UrlArgs) (int, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// check if response is valid
|
// check if response is valid
|
||||||
if !u.validateResponse(resp) {
|
if !u.isResponseValid(resp) {
|
||||||
return 0, fmt.Errorf("invalid response")
|
return 0, fmt.Errorf("invalid response")
|
||||||
}
|
}
|
||||||
|
|
||||||
var API_Response UpworkApiResponse
|
|
||||||
|
|
||||||
json.Unmarshal([]byte(resp), &API_Response)
|
json.Unmarshal([]byte(resp), &API_Response)
|
||||||
|
|
||||||
total_docs := API_Response.SearchResults.Paging.Total
|
total_docs := API_Response.SearchResults.Paging.Total
|
||||||
@@ -144,7 +157,7 @@ func (u *UpworkPipeLine) handleRequest(urlArgs UrlArgs, iteration int) {
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
|
|
||||||
//check if response is valid
|
//check if response is valid
|
||||||
} else if u.validateResponse(resp) {
|
} else if u.isResponseValid(resp) {
|
||||||
filename := fmt.Sprintf("data/%d.json", iteration)
|
filename := fmt.Sprintf("data/%d.json", iteration)
|
||||||
// Convert resp to array of bytes
|
// Convert resp to array of bytes
|
||||||
err = u.saveToFile([]byte(resp), filename)
|
err = u.saveToFile([]byte(resp), filename)
|
||||||
@@ -178,7 +191,13 @@ func (u *UpworkPipeLine) Run(query string) error {
|
|||||||
var perPage int
|
var perPage int
|
||||||
var total_docs int
|
var total_docs int
|
||||||
|
|
||||||
info_message := fmt.Sprintf("Finding Total Jobs for %s", query)
|
qum := query
|
||||||
|
|
||||||
|
if qum == "" {
|
||||||
|
qum = "Empty String"
|
||||||
|
}
|
||||||
|
|
||||||
|
info_message := fmt.Sprintf("Finding Total Jobs for %s", qum)
|
||||||
fmt.Println(info_message)
|
fmt.Println(info_message)
|
||||||
|
|
||||||
urlArgs := UrlArgs{
|
urlArgs := UrlArgs{
|
||||||
@@ -191,36 +210,34 @@ func (u *UpworkPipeLine) Run(query string) error {
|
|||||||
perPage = 50
|
perPage = 50
|
||||||
total_docs, err = u.getTotalDocuments(urlArgs)
|
total_docs, err = u.getTotalDocuments(urlArgs)
|
||||||
|
|
||||||
log.Print(total_docs)
|
if err != nil {
|
||||||
if err == nil {
|
log.Print("Could not retrive total number of jobs")
|
||||||
info_message := fmt.Sprintf("%s has a total of %d jobs", query, total_docs)
|
log.Fatal(err)
|
||||||
fmt.Println(info_message)
|
|
||||||
iteration = total_docs / perPage
|
|
||||||
|
|
||||||
if iteration >= 100 {
|
|
||||||
iteration = 100
|
|
||||||
}
|
|
||||||
|
|
||||||
info_message = fmt.Sprintf("A total of %d iterations will be performed", iteration)
|
|
||||||
fmt.Println(info_message)
|
|
||||||
|
|
||||||
//Found total iterations
|
|
||||||
u.handledataIteration(perPage, iteration, query)
|
|
||||||
err = u.CombineFiles()
|
|
||||||
os.RemoveAll("data")
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
|
|
||||||
} else {
|
|
||||||
log.Fatal("Could not retrive total number of jobs")
|
|
||||||
panic(err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
info_message = fmt.Sprintf("%s has a total of %d jobs", query, total_docs)
|
||||||
|
fmt.Println(info_message)
|
||||||
|
iteration = total_docs / perPage
|
||||||
|
|
||||||
|
if iteration >= 100 {
|
||||||
|
iteration = 100
|
||||||
|
}
|
||||||
|
|
||||||
|
info_message = fmt.Sprintf("A total of %d iterations will be performed", iteration)
|
||||||
|
fmt.Println(info_message)
|
||||||
|
|
||||||
|
//Found total iterations
|
||||||
|
u.handledataIteration(perPage, iteration, query)
|
||||||
|
err = u.CombineFiles()
|
||||||
|
os.RemoveAll("data")
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (u *UpworkPipeLine) handledataIteration(p_per int, iters int, query string) {
|
func (u *UpworkPipeLine) handledataIteration(p_per int, iters int, query string) {
|
||||||
|
|||||||
Reference in New Issue
Block a user